diff options
105 files changed, 6653 insertions, 1799 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 104322bf378..34ea4f1fa6e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -200,12 +200,9 @@ inode_readahead_blks=n This tuning parameter controls the maximum table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. -nouser_xattr Disables Extended User Attributes. If you have extended - attribute support enabled in the kernel configuration - (CONFIG_EXT4_FS_XATTR), extended attribute support - is enabled by default on mount. See the attr(5) manual - page and http://acl.bestbits.at/ for more information - about extended attributes. +nouser_xattr Disables Extended User Attributes. See the + attr(5) manual page and http://acl.bestbits.at/ + for more information about extended attributes. noacl This option disables POSIX Access Control List support. If ACL support is enabled in the kernel diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 20e248cc03a..ea8e5b48557 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. + numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. + Allowed values are enable and disable + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt index 597c3c58137..1e469ef7577 100644 --- a/Documentation/prctl/seccomp_filter.txt +++ b/Documentation/prctl/seccomp_filter.txt @@ -95,12 +95,15 @@ SECCOMP_RET_KILL: SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering - task without executing the system call. The kernel will - rollback the register state to just before the system call - entry such that a signal handler in the task will be able to - inspect the ucontext_t->uc_mcontext registers and emulate - system call success or failure upon return from the signal - handler. + task without executing the system call. siginfo->si_call_addr + will show the address of the system call instruction, and + siginfo->si_syscall and siginfo->si_arch will indicate which + syscall was attempted. The program counter will be as though + the syscall happened (i.e. it will not point to the syscall + instruction). The return value register will contain an arch- + dependent value -- if resuming execution, set it to something + sensible. (The architecture dependency is because replacing + it with -ENOSYS could overwrite some useful information.) The SECCOMP_RET_DATA portion of the return value will be passed as si_errno. @@ -123,6 +126,18 @@ SECCOMP_RET_TRACE: the BPF program return value will be available to the tracer via PTRACE_GETEVENTMSG. + The tracer can skip the system call by changing the syscall number + to -1. Alternatively, the tracer can change the system call + requested by changing the system call to a valid syscall number. If + the tracer asks to skip the system call, then the system call will + appear to return the value that the tracer puts in the return value + register. + + The seccomp check will not be run again after the tracer is + notified. (This means that seccomp-based sandboxes MUST NOT + allow use of ptrace, even of other sandboxed processes, without + extreme care; ptracers can use this mechanism to escape.) + SECCOMP_RET_ALLOW: Results in the system call being executed. @@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to support seccomp filter with minor fixup: SIGSYS support and seccomp return value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER to its arch-specific Kconfig. + + + +Caveats +------- + +The vDSO can cause some system calls to run entirely in userspace, +leading to surprises when you run programs on different machines that +fall back to real syscalls. To minimize these surprises on x86, make +sure you test with +/sys/devices/system/clocksource/clocksource0/current_clocksource set to +something like acpi_pm. + +On x86-64, vsyscall emulation is enabled by default. (vsyscalls are +legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities: + +- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to + the vsyscall entry for the given call and not the address after the + 'syscall' instruction. Any code which wants to restart the call + should be aware that (a) a ret instruction has been emulated and (b) + trying to resume the syscall will again trigger the standard vsyscall + emulation security checks, making resuming the syscall mostly + pointless. + +- A return value of SECCOMP_RET_TRACE will signal the tracer as usual, + but the syscall may not be changed to another system call using the + orig_rax register. It may only be changed to -1 order to skip the + currently emulated call. Any other change MAY terminate the process. + The rip value seen by the tracer will be the syscall entry address; + this is different from normal behavior. The tracer MUST NOT modify + rip or rsp. (Do not rely on other changes terminating the process. + They might work. For example, on some kernels, choosing a syscall + that only exists in future kernels will be correctly emulated (by + returning -ENOSYS). + +To detect this quirky behavior, check for addr & ~0x0C00 == +0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For +SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other +condition: future kernels may improve vsyscall emulation and current +kernels in vsyscall=native mode will behave differently, but the +instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these +cases. + +Note that modern systems are unlikely to use vsyscalls at all -- they +are a legacy feature and they are considerably slower than standard +syscalls. New code will use the vDSO, and vDSO-issued system calls +are indistinguishable from normal system calls. diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 7d9ca92022d..7b4145d0045 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -994,6 +994,23 @@ payload contents" for more information. reference pointer if successful. +(*) A keyring can be created by: + + struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, + const struct cred *cred, + key_perm_t perm, + unsigned long flags, + struct key *dest); + + This creates a keyring with the given attributes and returns it. If dest + is not NULL, the new keyring will be linked into the keyring to which it + points. No permission checks are made upon the destination keyring. + + Error EDQUOT can be returned if the keyring would overload the quota (pass + KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted + towards the user's quota). Error ENOMEM can also be returned. + + (*) To check the validity of a key, this function can be called: int validate_key(struct key *key); diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index cb8f9920f4d..0f7c852f355 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -111,6 +111,7 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL + select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 65a872bf72f..97f8c5ad8c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,8 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f7..5199db2923d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | + _PAGE_NUMA); +} + +#define pte_accessible pte_accessible +static inline int pte_accessible(pte_t a) +{ + return pte_flags(a) & _PAGE_PRESENT; } static inline int pte_hidden(pte_t pte) @@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | + _PAGE_NUMA); } static inline int pmd_none(pmd_t pmd) @@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { +#ifdef CONFIG_NUMA_BALANCING + /* pmd_numa check */ + if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) + return 0; +#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505..3c32db8c539 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -64,6 +64,26 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +/* + * _PAGE_NUMA indicates that this page will trigger a numa hinting + * minor page fault to gather numa placement statistics (see + * pte_numa()). The bit picked (8) is within the range between + * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't + * require changes to the swp entry format because that bit is always + * zero when the pte is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + * + * Because we shared the same bit (8) with _PAGE_PROTNONE this can be + * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE + * couldn't reach, like handle_mm_fault() (see access_error in + * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for + * handle_mm_fault() to be invoked). + */ +#define _PAGE_NUMA _PAGE_PROTNONE + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3a3e8c9e280..9a907a67be8 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } -#ifdef CONFIG_SECCOMP -static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) -{ - if (!seccomp_mode(&tsk->seccomp)) - return 0; - task_pt_regs(tsk)->orig_ax = syscall_nr; - task_pt_regs(tsk)->ax = syscall_nr; - return __secure_computing(syscall_nr); -} -#else -#define vsyscall_seccomp(_tsk, _nr) 0 -#endif - static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; - int vsyscall_nr; + int vsyscall_nr, syscall_nr, tmp; int prev_sig_on_uaccess_error; long ret; - int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; /* + * Check for access_ok violations and find the syscall nr. + * * NULL is a valid user pointer (in the access_ok sense) on 32-bit and * 64-bit, so we don't need to special-case it here. For all the * vsyscalls, NULL means "don't write anything" not "write it at * address 0". */ - ret = -EFAULT; - skip = 0; switch (vsyscall_nr) { case 0: - skip = vsyscall_seccomp(tsk, __NR_gettimeofday); - if (skip) - break; - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) - break; + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(syscall_nr); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + if (tmp) + goto do_ret; /* skip requested */ + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - skip = vsyscall_seccomp(tsk, __NR_time); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(time_t))) - break; - ret = sys_time((time_t __user *)regs->di); break; case 2: - skip = vsyscall_seccomp(tsk, __NR_getcpu); - if (skip) - break; - - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) - break; - ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, NULL); @@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - if (skip) { - if ((long)regs->ax <= 0L) /* seccomp errno emulation */ - goto do_ret; - goto done; /* seccomp trace/trap */ - } - +check_fault: if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -311,7 +320,6 @@ do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; -done: return true; sigsegv: diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 217eb705fac..e27fbf887f3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); - flush_tlb_page(vma, address); } return changed; diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig index bbec35d21fe..0f51ed687dc 100644 --- a/drivers/bus/Kconfig +++ b/drivers/bus/Kconfig @@ -6,6 +6,7 @@ menu "Bus devices" config OMAP_OCP2SCP tristate "OMAP OCP2SCP DRIVER" + depends on ARCH_OMAP2PLUS help Driver to enable ocp2scp module which transforms ocp interface protocol to scp protocol. In OMAP4, USB PHY is connected via diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c index 7da840d487d..9978609d93b 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.c +++ b/drivers/char/tpm/tpm_ibmvtpm.c @@ -38,8 +38,6 @@ static struct vio_device_id tpm_ibmvtpm_device_table[] = { }; MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table); -DECLARE_WAIT_QUEUE_HEAD(wq); - /** * ibmvtpm_send_crq - Send a CRQ request * @vdev: vio device struct @@ -83,6 +81,7 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) { struct ibmvtpm_dev *ibmvtpm; u16 len; + int sig; ibmvtpm = (struct ibmvtpm_dev *)chip->vendor.data; @@ -91,22 +90,23 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) return 0; } - wait_event_interruptible(wq, ibmvtpm->crq_res.len != 0); + sig = wait_event_interruptible(ibmvtpm->wq, ibmvtpm->res_len != 0); + if (sig) + return -EINTR; + + len = ibmvtpm->res_len; - if (count < ibmvtpm->crq_res.len) { + if (count < len) { dev_err(ibmvtpm->dev, "Invalid size in recv: count=%ld, crq_size=%d\n", - count, ibmvtpm->crq_res.len); + count, len); return -EIO; } spin_lock(&ibmvtpm->rtce_lock); - memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, ibmvtpm->crq_res.len); - memset(ibmvtpm->rtce_buf, 0, ibmvtpm->crq_res.len); - ibmvtpm->crq_res.valid = 0; - ibmvtpm->crq_res.msg = 0; - len = ibmvtpm->crq_res.len; - ibmvtpm->crq_res.len = 0; + memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, len); + memset(ibmvtpm->rtce_buf, 0, len); + ibmvtpm->res_len = 0; spin_unlock(&ibmvtpm->rtce_lock); return len; } @@ -273,7 +273,6 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev) int rc = 0; free_irq(vdev->irq, ibmvtpm); - tasklet_kill(&ibmvtpm->tasklet); do { if (rc) @@ -372,7 +371,6 @@ static int ibmvtpm_reset_crq(struct ibmvtpm_dev *ibmvtpm) static int tpm_ibmvtpm_resume(struct device *dev) { struct ibmvtpm_dev *ibmvtpm = ibmvtpm_get_data(dev); - unsigned long flags; int rc = 0; do { @@ -387,10 +385,11 @@ static int tpm_ibmvtpm_resume(struct device *dev) return rc; } - spin_lock_irqsave(&ibmvtpm->lock, flags); - vio_disable_interrupts(ibmvtpm->vdev); - tasklet_schedule(&ibmvtpm->tasklet); - spin_unlock_irqrestore(&ibmvtpm->lock, flags); + rc = vio_enable_interrupts(ibmvtpm->vdev); + if (rc) { + dev_err(dev, "Error vio_enable_interrupts rc=%d\n", rc); + return rc; + } rc = ibmvtpm_crq_send_init(ibmvtpm); if (rc) @@ -467,7 +466,7 @@ static struct ibmvtpm_crq *ibmvtpm_crq_get_next(struct ibmvtpm_dev *ibmvtpm) if (crq->valid & VTPM_MSG_RES) { if (++crq_q->index == crq_q->num_entry) crq_q->index = 0; - rmb(); + smp_rmb(); } else crq = NULL; return crq; @@ -535,11 +534,9 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, ibmvtpm->vtpm_version = crq->data; return; case VTPM_TPM_COMMAND_RES: - ibmvtpm->crq_res.valid = crq->valid; - ibmvtpm->crq_res.msg = crq->msg; - ibmvtpm->crq_res.len = crq->len; - ibmvtpm->crq_res.data = crq->data; - wake_up_interruptible(&wq); + /* len of the data in rtce buffer */ + ibmvtpm->res_len = crq->len; + wake_up_interruptible(&ibmvtpm->wq); return; default: return; @@ -559,38 +556,19 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq, static irqreturn_t ibmvtpm_interrupt(int irq, void *vtpm_instance) { struct ibmvtpm_dev *ibmvtpm = (struct ibmvtpm_dev *) vtpm_instance; - unsigned long flags; - - spin_lock_irqsave(&ibmvtpm->lock, flags); - vio_disable_interrupts(ibmvtpm->vdev); - tasklet_schedule(&ibmvtpm->tasklet); - spin_unlock_irqrestore(&ibmvtpm->lock, flags); - - return IRQ_HANDLED; -} - -/** - * ibmvtpm_tasklet - Interrupt handler tasklet - * @data: ibm vtpm device struct - * - * Returns: - * Nothing - **/ -static void ibmvtpm_tasklet(void *data) -{ - struct ibmvtpm_dev *ibmvtpm = data; struct ibmvtpm_crq *crq; - unsigned long flags; - spin_lock_irqsave(&ibmvtpm->lock, flags); + /* while loop is needed for initial setup (get version and + * get rtce_size). There should be only one tpm request at any + * given time. + */ while ((crq = ibmvtpm_crq_get_next(ibmvtpm)) != NULL) { ibmvtpm_crq_process(crq, ibmvtpm); crq->valid = 0; - wmb(); + smp_wmb(); } - vio_enable_interrupts(ibmvtpm->vdev); - spin_unlock_irqrestore(&ibmvtpm->lock, flags); + return IRQ_HANDLED; } /** @@ -650,9 +628,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto reg_crq_cleanup; } - tasklet_init(&ibmvtpm->tasklet, (void *)ibmvtpm_tasklet, - (unsigned long)ibmvtpm); - rc = request_irq(vio_dev->irq, ibmvtpm_interrupt, 0, tpm_ibmvtpm_driver_name, ibmvtpm); if (rc) { @@ -666,13 +641,14 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, goto init_irq_cleanup; } + init_waitqueue_head(&ibmvtpm->wq); + crq_q->index = 0; ibmvtpm->dev = dev; ibmvtpm->vdev = vio_dev; chip->vendor.data = (void *)ibmvtpm; - spin_lock_init(&ibmvtpm->lock); spin_lock_init(&ibmvtpm->rtce_lock); rc = ibmvtpm_crq_send_init(ibmvtpm); @@ -689,7 +665,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev, return rc; init_irq_cleanup: - tasklet_kill(&ibmvtpm->tasklet); do { rc1 = plpar_hcall_norets(H_FREE_CRQ, vio_dev->unit_address); } while (rc1 == H_BUSY || H_IS_LONG_BUSY(rc1)); diff --git a/drivers/char/tpm/tpm_ibmvtpm.h b/drivers/char/tpm/tpm_ibmvtpm.h index 4296eb4b4d8..bd82a791f99 100644 --- a/drivers/char/tpm/tpm_ibmvtpm.h +++ b/drivers/char/tpm/tpm_ibmvtpm.h @@ -38,13 +38,12 @@ struct ibmvtpm_dev { struct vio_dev *vdev; struct ibmvtpm_crq_queue crq_queue; dma_addr_t crq_dma_handle; - spinlock_t lock; - struct tasklet_struct tasklet; u32 rtce_size; void __iomem *rtce_buf; dma_addr_t rtce_dma_handle; spinlock_t rtce_lock; - struct ibmvtpm_crq crq_res; + wait_queue_head_t wq; + u16 res_len; u32 vtpm_version; }; diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 77629d33f03..febead4bf8a 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -544,6 +544,7 @@ config KEYBOARD_OMAP config KEYBOARD_OMAP4 tristate "TI OMAP4+ keypad support" + depends on ARCH_OMAP2PLUS select INPUT_MATRIXKMAP help Say Y here if you want to use the OMAP4+ keypad. diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 7eb73c561bd..5de6e7f39f9 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -6,6 +6,7 @@ comment "USB Physical Layer drivers" config OMAP_USB2 tristate "OMAP USB2 PHY Driver" + depends on ARCH_OMAP2PLUS select USB_OTG_UTILS help Enable this to support the transceiver that is part of SOC. This diff --git a/drivers/video/omap2/Kconfig b/drivers/video/omap2/Kconfig index 346d67d6cf4..b07b2b042e7 100644 --- a/drivers/video/omap2/Kconfig +++ b/drivers/video/omap2/Kconfig @@ -1,6 +1,10 @@ config OMAP2_VRFB bool +if ARCH_OMAP2PLUS + source "drivers/video/omap2/dss/Kconfig" source "drivers/video/omap2/omapfb/Kconfig" source "drivers/video/omap2/displays/Kconfig" + +endif diff --git a/drivers/w1/masters/Kconfig b/drivers/w1/masters/Kconfig index c433a746e3f..e8ca63a82b9 100644 --- a/drivers/w1/masters/Kconfig +++ b/drivers/w1/masters/Kconfig @@ -60,6 +60,7 @@ config W1_MASTER_GPIO config HDQ_MASTER_OMAP tristate "OMAP HDQ driver" + depends on ARCH_OMAP help Say Y here if you want support for the 1-wire or HDQ Interface on an OMAP processor. diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 58db6df866e..af47e759446 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -338,9 +338,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = xen_phys_to_bus(phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -356,10 +355,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * Oh well, have to allocate and map a bounce buffer. */ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); - if (!map) + if (map == SWIOTLB_MAP_ERROR) return DMA_ERROR_CODE; - dev_addr = xen_virt_to_bus(map); + dev_addr = xen_phys_to_bus(map); /* * Ensure that the address returned is DMA'ble @@ -389,7 +388,7 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -434,8 +433,7 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(dev_addr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); return; } @@ -494,11 +492,12 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length) || range_straddles_page_boundary(paddr, sg->length)) { - void *map = swiotlb_tbl_map_single(hwdev, - start_dma_addr, - sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { /* Don't panic here, we expect map_sg users to do proper error handling. */ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, @@ -506,7 +505,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, sgl[0].dma_length = 0; return DMA_ERROR_CODE; } - sg->dma_address = xen_virt_to_bus(map); + sg->dma_address = xen_phys_to_bus(map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; diff --git a/fs/Kconfig b/fs/Kconfig index f95ae3a027f..eaff24a1950 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -28,8 +28,8 @@ config FS_MBCACHE tristate default y if EXT2_FS=y && EXT2_FS_XATTR default y if EXT3_FS=y && EXT3_FS_XATTR - default y if EXT4_FS=y && EXT4_FS_XATTR - default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT4_FS=y + default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS source "fs/reiserfs/Kconfig" source "fs/jfs/Kconfig" diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 75c1ee69914..5cbd00e7406 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -346,19 +346,15 @@ init_cifs_idmap(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index c22f17021b6..0a475c88185 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23 compiled kernel size by using one file system driver for ext2, ext3, and ext4 file systems. -config EXT4_FS_XATTR - bool "Ext4 extended attributes" - depends on EXT4_FS - default y - help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page, or visit - <http://acl.bestbits.at/> for details). - - If unsure, say N. - - You need this for POSIX ACL support on ext4. - config EXT4_FS_POSIX_ACL bool "Ext4 POSIX Access Control Lists" - depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL config EXT4_FS_SECURITY bool "Ext4 Security Labels" - depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. This option diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f86593..0310fec2ee3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ + xattr_trusted.o inline.o -ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index d3c5b88fd89..e6e0d988439 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value, retry: handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto release_and_out; + } error = ext4_set_acl(handle, inode, type, acl); ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 8e07d2a5a13..b8d877f6c1f 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -27,23 +27,11 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include "ext4.h" - -static unsigned char ext4_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; +#include "xattr.h" static int ext4_dx_readdir(struct file *filp, void *dirent, filldir_t filldir); -static unsigned char get_dtype(struct super_block *sb, int filetype) -{ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) - return DT_UNKNOWN; - - return (ext4_filetype_table[filetype]); -} - /** * Check if the given dir-inode refers to an htree-indexed directory * (or a directory which chould potentially get coverted to use htree @@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode) * Return 0 if the directory entry is OK, and 1 if there is a problem * * Note: this is the opposite of what ext2 and ext3 historically returned... + * + * bh passed here can be an inode block or a dir data block, depending + * on the inode inline data flag. */ int __ext4_check_dir_entry(const char *function, unsigned int line, struct inode *dir, struct file *filp, struct ext4_dir_entry_2 *de, - struct buffer_head *bh, + struct buffer_head *bh, char *buf, int size, unsigned int offset) { const char *error_msg = NULL; @@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - bh->b_data) + rlen > - dir->i_sb->s_blocksize)) - error_msg = "directory entry across blocks"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; @@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u(%u), " "inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned) (offset % bh->b_size), + error_msg, (unsigned) (offset % size), offset, le32_to_cpu(de->inode), rlen, de->name_len); @@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp, int ret = 0; int dir_has_error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + ret = ext4_read_inline_dir(filp, dirent, filldir, + &has_inline_data); + if (has_inline_data) + return ret; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(filp, dirent, filldir); if (err != ERR_BAD_DX_DIR) { @@ -221,8 +219,9 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (ext4_check_dir_entry(inode, filp, de, - bh, offset)) { + if (ext4_check_dir_entry(inode, filp, de, bh, + bh->b_data, bh->b_size, + offset)) { /* * On error, skip the f_pos to the next block */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index df163da388c..8462eb3c33a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,6 +57,16 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +/* + * Turn on EXT_DEBUG to get lots of info about extents operations. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) @@ -392,6 +402,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -448,28 +459,26 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ + EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; -#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) -#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ - printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ - EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); } - -/* - * Since it's pretty easy to mix up bit numbers and hex values, and we - * can't do a compile-time test for ENUM values, we use a run-time - * test to make sure that EXT4_XXX_FL is consistent with respect to - * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop - * out so it won't cost any extra space in the compiled kernel image. - * But it's important that these values are the same, since we are - * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL - * must be consistent with the values of FS_XXX_FL defined in - * include/linux/fs.h and the on-disk values found in ext2, ext3, and - * ext4 filesystems, and of course the values defined in e2fsprogs. +/* + * Since it's pretty easy to mix up bit numbers and hex values, we use a + * build-time check to make sure that EXT4_XXX_FL is consistent with respect to + * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost + * any extra space in the compiled kernel image, otherwise, the build will fail. + * It's important that these values are the same, since we are using + * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent + * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk + * values found in ext2, ext3 and ext4 filesystems, and of course the values + * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ +#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) +#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) + static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); @@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); + CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(RESERVED); } @@ -811,6 +821,8 @@ struct ext4_ext_cache { __u32 ec_len; /* must be 32bit to return holes */ }; +#include "extents_status.h" + /* * fourth extended file system inode data in memory */ @@ -833,7 +845,6 @@ struct ext4_inode_info { #endif unsigned long i_flags; -#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -842,7 +853,6 @@ struct ext4_inode_info { * EAs. */ struct rw_semaphore xattr_sem; -#endif struct list_head i_orphan; /* unlinked but open inodes */ @@ -888,6 +898,10 @@ struct ext4_inode_info { struct list_head i_prealloc_list; spinlock_t i_prealloc_lock; + /* extents status tree */ + struct ext4_es_tree i_es_tree; + rwlock_t i_es_lock; + /* ialloc */ ext4_group_t i_last_alloc_group; @@ -902,6 +916,10 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; + /* Indicate the inline data space. */ + u16 i_inline_off; + u16 i_inline_size; + #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; @@ -1360,6 +1378,7 @@ enum { EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ + EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ -#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail { __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; +#define EXT4_DIRENT_TAIL(block, blocksize) \ + ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ + ((blocksize) - \ + sizeof(struct ext4_dir_entry_tail)))) + /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. @@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, offset) \ + struct buffer_head *, char *, int, + unsigned int); +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); extern void ext4_htree_free_dir_info(struct dir_private_info *p); +extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen); +static inline void ext4_update_dx_flag(struct inode *inode) +{ + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); +} +static unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static inline unsigned char get_dtype(struct super_block *sb, int filetype) +{ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || + (filetype >= EXT4_FT_MAX)) + return DT_UNKNOWN; + + return ext4_filetype_table[filetype]; +} /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); @@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create); +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)); +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh); +#define FALL_BACK_TO_NONDELALLOC 1 +#define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); @@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +extern int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); +extern int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode); extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); +extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len); +extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize); +extern int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; @@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); /* extents.c */ +struct ext4_ext_path; +struct ext4_extent; + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, @@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + ext4_lblk_t lblocks); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); +extern int ext4_can_extents_be_merged(struct inode *inode, + struct ext4_extent *ex1, + struct ext4_extent *ex2); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, + struct ext4_ext_path *, + struct ext4_extent *, int); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +extern int ext4_ext_check_inode(struct inode *inode); +extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); + + /* move_extent.c */ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, @@ -2445,14 +2558,10 @@ enum ext4_state_bits { * never, ever appear in a buffer_head's state * flag. See EXT4_MAP_FROM_CLUSTER to see where * this is used. */ - BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This - * flag is set when ext4_map_blocks is called on a - * delayed allocated block to get its real mapping. */ }; BUFFER_FNS(Uninit, uninit) TAS_BUFFER_FNS(Uninit, uninit) -BUFFER_FNS(Da_Mapped, da_mapped) /* * Add new method to test whether block and inode bitmaps are properly @@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ -#include "ext4_extents.h" - #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index cb1b2c91996..487fda12bc0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -43,16 +43,6 @@ #define CHECK_BINSEARCH__ /* - * Turn on EXT_DEBUG to get lots of info about extents operations. - */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) -#else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) -#endif - -/* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ @@ -144,20 +134,6 @@ struct ext4_ext_path { */ /* - * to be called by ext4_ext_walk_space() - * negative retcode - error - * positive retcode - signal for ext4_ext_walk_space(), see below - * callback must return valid extent (passed or newly created) - */ -typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, - struct ext4_ext_cache *, - struct ext4_extent *, void *); - -#define EXT_CONTINUE 0 -#define EXT_BREAK 1 -#define EXT_REPEAT 2 - -/* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. */ @@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, 0xffff); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, - ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, - int num, - struct ext4_ext_path *path); -extern int ext4_can_extents_be_merged(struct inode *inode, - struct ext4_extent *ex1, - struct ext4_extent *ex2); -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *); -extern void ext4_ext_drop_refs(struct ext4_ext_path *); -extern int ext4_ext_check_inode(struct inode *inode); -extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse); #endif /* _EXT4_EXTENTS */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 56d258c1830..7177f9b21cb 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle) handle->h_sync = 1; } -static inline void ext4_handle_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_release_buffer(handle, bh); -} - static inline int ext4_handle_is_aborted(handle_t *handle) { if (ext4_handle_valid(handle)) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7011ac96720..26af22832a8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -41,6 +41,8 @@ #include <asm/uaccess.h> #include <linux/fiemap.h> #include "ext4_jbd2.h" +#include "ext4_extents.h" +#include "xattr.h" #include <trace/events/ext4.h> @@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex); + static int ext4_ext_truncate_extend_restart(handle_t *handle, struct inode *inode, int needed) @@ -1959,27 +1964,33 @@ cleanup: return err; } -static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, - ext4_lblk_t num, ext_prepare_callback func, - void *cbdata) +static int ext4_fill_fiemap_extents(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) { struct ext4_ext_path *path = NULL; - struct ext4_ext_cache cbex; + struct ext4_ext_cache newex; struct ext4_extent *ex; - ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t next, next_del, start = 0, end = 0; ext4_lblk_t last = block + num; - int depth, exists, err = 0; - - BUG_ON(func == NULL); - BUG_ON(inode == NULL); + int exists, depth = 0, err = 0; + unsigned int flags = 0; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); + + if (path && ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + path = ext4_ext_find_extent(inode, block, path); - up_read(&EXT4_I(inode)->i_data_sem); if (IS_ERR(path)) { + up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); path = NULL; break; @@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, depth = ext_depth(inode); if (unlikely(path[depth].p_hdr == NULL)) { + up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); err = -EIO; break; } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); + ext4_ext_drop_refs(path); + flags = 0; exists = 0; if (!ex) { /* there is no extent yet, so try to allocate @@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, BUG_ON(end <= start); if (!exists) { - cbex.ec_block = start; - cbex.ec_len = end - start; - cbex.ec_start = 0; + newex.ec_block = start; + newex.ec_len = end - start; + newex.ec_start = 0; } else { - cbex.ec_block = le32_to_cpu(ex->ee_block); - cbex.ec_len = ext4_ext_get_actual_len(ex); - cbex.ec_start = ext4_ext_pblock(ex); + newex.ec_block = le32_to_cpu(ex->ee_block); + newex.ec_len = ext4_ext_get_actual_len(ex); + newex.ec_start = ext4_ext_pblock(ex); + if (ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; } - if (unlikely(cbex.ec_len == 0)) { - EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); - err = -EIO; - break; + /* + * Find delayed extent and update newex accordingly. We call + * it even in !exists case to find out whether newex is the + * last existing extent or not. + */ + next_del = ext4_find_delayed_extent(inode, &newex); + if (!exists && next_del) { + exists = 1; + flags |= FIEMAP_EXTENT_DELALLOC; } - err = func(inode, next, &cbex, ex, cbdata); - ext4_ext_drop_refs(path); + up_read(&EXT4_I(inode)->i_data_sem); - if (err < 0) + if (unlikely(newex.ec_len == 0)) { + EXT4_ERROR_INODE(inode, "newex.ec_len == 0"); + err = -EIO; break; + } - if (err == EXT_REPEAT) - continue; - else if (err == EXT_BREAK) { - err = 0; - break; + /* This is possible iff next == next_del == EXT_MAX_BLOCKS */ + if (next == next_del) { + flags |= FIEMAP_EXTENT_LAST; + if (unlikely(next_del != EXT_MAX_BLOCKS || + next != EXT_MAX_BLOCKS)) { + EXT4_ERROR_INODE(inode, + "next extent == %u, next " + "delalloc extent = %u", + next, next_del); + err = -EIO; + break; + } } - if (ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; + if (exists) { + err = fiemap_fill_next_extent(fieinfo, + (__u64)newex.ec_block << blksize_bits, + (__u64)newex.ec_start << blksize_bits, + (__u64)newex.ec_len << blksize_bits, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } } - block = cbex.ec_block + cbex.ec_len; + block = newex.ec_block + newex.ec_len; } if (path) { @@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; - struct ext4_sb_info *sbi; int ret = 0; /* @@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; - sbi = EXT4_SB(inode->i_sb); /* has cache valid data? */ if (cex->ec_len == 0) @@ -2273,7 +2309,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; - int depth = ext_depth(inode); + int depth; + + /* If we are converting the inline data, only one is needed here. */ + if (ext4_has_inline_data(inode)) + return 1; + + depth = ext_depth(inode); if (chunk) index = depth * 2; @@ -3461,115 +3503,34 @@ out: /** * ext4_find_delalloc_range: find delayed allocated block in the given range. * - * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns - * whether there are any buffers marked for delayed allocation. It returns '1' - * on the first delalloc'ed buffer head found. If no buffer head in the given - * range is marked for delalloc, it returns 0. - * lblk_start should always be <= lblk_end. - * search_hint_reverse is to indicate that searching in reverse from lblk_end to - * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed - * block sooner). This is useful when blocks are truncated sequentially from - * lblk_start towards lblk_end. + * Return 1 if there is a delalloc block in the range, otherwise 0. */ static int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, - ext4_lblk_t lblk_end, - int search_hint_reverse) + ext4_lblk_t lblk_end) { - struct address_space *mapping = inode->i_mapping; - struct buffer_head *head, *bh = NULL; - struct page *page; - ext4_lblk_t i, pg_lblk; - pgoff_t index; - - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; - - /* reverse search wont work if fs block size is less than page size */ - if (inode->i_blkbits < PAGE_CACHE_SHIFT) - search_hint_reverse = 0; + struct extent_status es; - if (search_hint_reverse) - i = lblk_end; + es.start = lblk_start; + ext4_es_find_extent(inode, &es); + if (es.len == 0) + return 0; /* there is no delay extent in this tree */ + else if (es.start <= lblk_start && lblk_start < es.start + es.len) + return 1; + else if (lblk_start <= es.start && es.start <= lblk_end) + return 1; else - i = lblk_start; - - index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - - while ((i >= lblk_start) && (i <= lblk_end)) { - page = find_get_page(mapping, index); - if (!page) - goto nextpage; - - if (!page_has_buffers(page)) - goto nextpage; - - head = page_buffers(page); - if (!head) - goto nextpage; - - bh = head; - pg_lblk = index << (PAGE_CACHE_SHIFT - - inode->i_blkbits); - do { - if (unlikely(pg_lblk < lblk_start)) { - /* - * This is possible when fs block size is less - * than page size and our cluster starts/ends in - * middle of the page. So we need to skip the - * initial few blocks till we reach the 'lblk' - */ - pg_lblk++; - continue; - } - - /* Check if the buffer is delayed allocated and that it - * is not yet mapped. (when da-buffers are mapped during - * their writeout, their da_mapped bit is set.) - */ - if (buffer_delay(bh) && !buffer_da_mapped(bh)) { - page_cache_release(page); - trace_ext4_find_delalloc_range(inode, - lblk_start, lblk_end, - search_hint_reverse, - 1, i); - return 1; - } - if (search_hint_reverse) - i--; - else - i++; - } while ((i >= lblk_start) && (i <= lblk_end) && - ((bh = bh->b_this_page) != head)); -nextpage: - if (page) - page_cache_release(page); - /* - * Move to next page. 'i' will be the first lblk in the next - * page. - */ - if (search_hint_reverse) - index--; - else - index++; - i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - } - - trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse, 0, 0); - return 0; + return 0; } -int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk, - int search_hint_reverse) +int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; - return ext4_find_delalloc_range(inode, lblk_start, lblk_end, - search_hint_reverse); + return ext4_find_delalloc_range(inode, lblk_start, lblk_end); } /** @@ -3630,7 +3591,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); lblk_to = lblk_from + c_offset - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3640,7 +3601,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; - if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0)) + if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) allocated_clusters--; } @@ -3663,8 +3624,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, flags, allocated); ext4_ext_show_leaf(inode, path); - trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated, - newblock); + trace_ext4_ext_handle_uninitialized_extents(inode, map, flags, + allocated, newblock); /* get_block() before submit the IO, split the extent */ if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { @@ -3911,7 +3872,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; + int free_on_err = 0, err = 0, depth; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -3927,7 +3888,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { if (!newex.ee_start_lo && !newex.ee_start_hi) { if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { @@ -4007,15 +3968,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ee_len, ee_start); goto out; } - ret = ext4_ext_handle_uninitialized_extents( + allocated = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); - return ret; + goto out3; } } if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk, 0)) + ext4_find_delalloc_cluster(inode, map->m_lblk)) map->m_flags |= EXT4_MAP_FROM_CLUSTER; /* @@ -4284,8 +4245,8 @@ out2: kfree(path); } - trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, - newblock, map->m_len, err ? err : allocated); +out3: + trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); return err ? err : allocated; } @@ -4344,6 +4305,8 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); + err = ext4_es_remove_extent(inode, last_block, + EXT_MAX_BLOCKS - last_block); err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final @@ -4434,6 +4397,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(file, offset, len); + ret = ext4_convert_inline_data(inode); + if (ret) + return ret; + trace_ext4_fallocate_enter(inode, offset, len, mode); map.m_lblk = offset >> blkbits; /* @@ -4572,206 +4539,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, } /* - * Callback function called for each extent to gather FIEMAP information. + * If newex is not existing extent (newex->ec_start equals zero) find + * delayed extent at start of newex and update newex accordingly and + * return start of the next delayed extent. + * + * If newex is existing extent (newex->ec_start is not equal zero) + * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed + * extent found. Leave newex unmodified. */ -static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, - struct ext4_ext_cache *newex, struct ext4_extent *ex, - void *data) +static int ext4_find_delayed_extent(struct inode *inode, + struct ext4_ext_cache *newex) { - __u64 logical; - __u64 physical; - __u64 length; - __u32 flags = 0; - int ret = 0; - struct fiemap_extent_info *fieinfo = data; - unsigned char blksize_bits; + struct extent_status es; + ext4_lblk_t next_del; - blksize_bits = inode->i_sb->s_blocksize_bits; - logical = (__u64)newex->ec_block << blksize_bits; + es.start = newex->ec_block; + next_del = ext4_es_find_extent(inode, &es); if (newex->ec_start == 0) { /* * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. - * - * Holes or delayed-extents are processed as follows. - * 1. lookup dirty pages with specified range in pagecache. - * If no page is got, then there is no delayed-extent and - * return with EXT_CONTINUE. - * 2. find the 1st mapped buffer, - * 3. check if the mapped buffer is both in the request range - * and a delayed buffer. If not, there is no delayed-extent, - * then return. - * 4. a delayed-extent is found, the extent will be collected. */ - ext4_lblk_t end = 0; - pgoff_t last_offset; - pgoff_t offset; - pgoff_t index; - pgoff_t start_index = 0; - struct page **pages = NULL; - struct buffer_head *bh = NULL; - struct buffer_head *head = NULL; - unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *); - - pages = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (pages == NULL) - return -ENOMEM; - - offset = logical >> PAGE_SHIFT; -repeat: - last_offset = offset; - head = NULL; - ret = find_get_pages_tag(inode->i_mapping, &offset, - PAGECACHE_TAG_DIRTY, nr_pages, pages); - - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* First time, try to find a mapped buffer. */ - if (ret == 0) { -out: - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - /* just a hole. */ - kfree(pages); - return EXT_CONTINUE; - } - index = 0; - -next_page: - /* Try to find the 1st mapped buffer. */ - end = ((__u64)pages[index]->index << PAGE_SHIFT) >> - blksize_bits; - if (!page_has_buffers(pages[index])) - goto out; - head = page_buffers(pages[index]); - if (!head) - goto out; - - index++; - bh = head; - do { - if (end >= newex->ec_block + - newex->ec_len) - /* The buffer is out of - * the request range. - */ - goto out; - - if (buffer_mapped(bh) && - end >= newex->ec_block) { - start_index = index - 1; - /* get the 1st mapped buffer. */ - goto found_mapped_buffer; - } - - bh = bh->b_this_page; - end++; - } while (bh != head); - - /* No mapped buffer in the range found in this page, - * We need to look up next page. - */ - if (index >= ret) { - /* There is no page left, but we need to limit - * newex->ec_len. - */ - newex->ec_len = end - newex->ec_block; - goto out; - } - goto next_page; - } else { - /*Find contiguous delayed buffers. */ - if (ret > 0 && pages[0]->index == last_offset) - head = page_buffers(pages[0]); - bh = head; - index = 1; - start_index = 0; - } - -found_mapped_buffer: - if (bh != NULL && buffer_delay(bh)) { - /* 1st or contiguous delayed buffer found. */ - if (!(flags & FIEMAP_EXTENT_DELALLOC)) { - /* - * 1st delayed buffer found, record - * the start of extent. - */ - flags |= FIEMAP_EXTENT_DELALLOC; - newex->ec_block = end; - logical = (__u64)end << blksize_bits; - } - /* Find contiguous delayed buffers. */ - do { - if (!buffer_delay(bh)) - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - - for (; index < ret; index++) { - if (!page_has_buffers(pages[index])) { - bh = NULL; - break; - } - head = page_buffers(pages[index]); - if (!head) { - bh = NULL; - break; - } - - if (pages[index]->index != - pages[start_index]->index + index - - start_index) { - /* Blocks are not contiguous. */ - bh = NULL; - break; - } - bh = head; - do { - if (!buffer_delay(bh)) - /* Delayed-extent ends. */ - goto found_delayed_extent; - bh = bh->b_this_page; - end++; - } while (bh != head); - } - } else if (!(flags & FIEMAP_EXTENT_DELALLOC)) - /* a hole found. */ - goto out; + if (es.len == 0) + /* A hole found. */ + return 0; -found_delayed_extent: - newex->ec_len = min(end - newex->ec_block, - (ext4_lblk_t)EXT_INIT_MAX_LEN); - if (ret == nr_pages && bh != NULL && - newex->ec_len < EXT_INIT_MAX_LEN && - buffer_delay(bh)) { - /* Have not collected an extent and continue. */ - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - goto repeat; + if (es.start > newex->ec_block) { + /* A hole found. */ + newex->ec_len = min(es.start - newex->ec_block, + newex->ec_len); + return 0; } - for (index = 0; index < ret; index++) - page_cache_release(pages[index]); - kfree(pages); + newex->ec_len = es.start + es.len - newex->ec_block; } - physical = (__u64)newex->ec_start << blksize_bits; - length = (__u64)newex->ec_len << blksize_bits; - - if (ex && ext4_ext_is_uninitialized(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - - if (next == EXT_MAX_BLOCKS) - flags |= FIEMAP_EXTENT_LAST; - - ret = fiemap_fill_next_extent(fieinfo, logical, physical, - length, flags); - if (ret < 0) - return ret; - if (ret == 1) - return EXT_BREAK; - return EXT_CONTINUE; + return next_del; } /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -4971,6 +4775,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ext4_ext_invalidate_cache(inode); ext4_discard_preallocations(inode); + err = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); err = ext4_ext_remove_space(inode, first_block, stop_block - 1); ext4_ext_invalidate_cache(inode); @@ -4991,12 +4797,22 @@ out_mutex: mutex_unlock(&inode->i_mutex); return err; } + int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { ext4_lblk_t start_blk; int error = 0; + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + + if (has_inline) + return error; + } + /* fallback to generic here if not in extents fmt */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return generic_block_fiemap(inode, fieinfo, start, len, @@ -5018,11 +4834,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* - * Walk the extent tree gathering extent information. - * ext4_ext_fiemap_cb will push extents back to user. + * Walk the extent tree gathering extent information + * and pushing extents back to the user. */ - error = ext4_ext_walk_space(inode, start_blk, len_blks, - ext4_ext_fiemap_cb, fieinfo); + error = ext4_fill_fiemap_extents(inode, start_blk, + len_blks, fieinfo); } return error; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c new file mode 100644 index 00000000000..564d981a2fc --- /dev/null +++ b/fs/ext4/extents_status.c @@ -0,0 +1,500 @@ +/* + * fs/ext4/extents_status.c + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Hugh Dickins <hughd@google.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + * Ext4 extents status tree core functions. + */ +#include <linux/rbtree.h> +#include "ext4.h" +#include "extents_status.h" +#include "ext4_extents.h" + +#include <trace/events/ext4.h> + +/* + * According to previous discussion in Ext4 Developer Workshop, we + * will introduce a new structure called io tree to track all extent + * status in order to solve some problems that we have met + * (e.g. Reservation space warning), and provide extent-level locking. + * Delay extent tree is the first step to achieve this goal. It is + * original built by Yongqiang Yang. At that time it is called delay + * extent tree, whose goal is only track delay extent in memory to + * simplify the implementation of fiemap and bigalloc, and introduce + * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called + * delay extent tree at the following comment. But for better + * understand what it does, it has been rename to extent status tree. + * + * Currently the first step has been done. All delay extents are + * tracked in the tree. It maintains the delay extent when a delay + * allocation is issued, and the delay extent is written out or + * invalidated. Therefore the implementation of fiemap and bigalloc + * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. + * + * The following comment describes the implemenmtation of extent + * status tree and future works. + */ + +/* + * extents status tree implementation for ext4. + * + * + * ========================================================================== + * Extents status encompass delayed extents and extent locks + * + * 1. Why delayed extent implementation ? + * + * Without delayed extent, ext4 identifies a delayed extent by looking + * up page cache, this has several deficiencies - complicated, buggy, + * and inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need + * to know if a block or a range of blocks are belonged to a delayed + * extent. + * + * Let us have a look at how they do without delayed extents implementation. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is + * already under delayed allocation or not to determine whether + * quota reserving is needed for the cluster. + * + * -- punch hole + * punch hole looks up page cache to identify a delayed extent. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is + * mapped, If there are not very many delayed buffers, then it is + * time comsuming. + * + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * bigalloc and writeout can figure out if a block or a range of + * blocks is under delayed allocation(belonged to a delayed extent) or + * not by searching the delayed extent tree. + * + * + * ========================================================================== + * 2. ext4 delayed extents impelmentation + * + * -- delayed extent + * A delayed extent is a range of blocks which are contiguous + * logically and under delayed allocation. Unlike extent in + * ext4, delayed extent in ext4 is a in-memory struct, there is + * no corresponding on-disk data. There is no limit on length of + * delayed extent, so a delayed extent can contain as many blocks + * as they are contiguous logically. + * + * -- delayed extent tree + * Every inode has a delayed extent tree and all under delayed + * allocation blocks are added to the tree as delayed extents. + * Delayed extents in the tree are ordered by logical block no. + * + * -- operations on a delayed extent tree + * There are three operations on a delayed extent tree: find next + * delayed extent, adding a space(a range of blocks) and removing + * a space. + * + * -- race on a delayed extent tree + * Delayed extent tree is protected inode->i_es_lock. + * + * + * ========================================================================== + * 3. performance analysis + * -- overhead + * 1. There is a cache extent for write access, so if writes are + * not very random, adding space operaions are in O(1) time. + * + * -- gain + * 2. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * + * ========================================================================== + * 4. TODO list + * -- Track all extent status + * + * -- Improve get block process + * + * -- Extent-level locking + */ + +static struct kmem_cache *ext4_es_cachep; + +int __init ext4_init_es(void) +{ + ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); + if (ext4_es_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_es(void) +{ + if (ext4_es_cachep) + kmem_cache_destroy(ext4_es_cachep); +} + +void ext4_es_init_tree(struct ext4_es_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_es = NULL; +} + +#ifdef ES_DEBUG__ +static void ext4_es_print_tree(struct inode *inode) +{ + struct ext4_es_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_es_tree; + node = rb_first(&tree->root); + while (node) { + struct extent_status *es; + es = rb_entry(node, struct extent_status, rb_node); + printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_es_print_tree(inode) +#endif + +static inline ext4_lblk_t extent_status_end(struct extent_status *es) +{ + BUG_ON(es->start + es->len < es->start); + return es->start + es->len - 1; +} + +/* + * search through the tree for an delayed extent with a given offset. If + * it can't be found, try to find next extent. + */ +static struct extent_status *__es_tree_search(struct rb_root *root, + ext4_lblk_t offset) +{ + struct rb_node *node = root->rb_node; + struct extent_status *es = NULL; + + while (node) { + es = rb_entry(node, struct extent_status, rb_node); + if (offset < es->start) + node = node->rb_left; + else if (offset > extent_status_end(es)) + node = node->rb_right; + else + return es; + } + + if (es && offset < es->start) + return es; + + if (es && offset > extent_status_end(es)) { + node = rb_next(&es->rb_node); + return node ? rb_entry(node, struct extent_status, rb_node) : + NULL; + } + + return NULL; +} + +/* + * ext4_es_find_extent: find the 1st delayed extent covering @es->start + * if it exists, otherwise, the next extent after @es->start. + * + * @inode: the inode which owns delayed extents + * @es: delayed extent that we found + * + * Returns the first block of the next extent after es, otherwise + * EXT_MAX_BLOCKS if no delay extent is found. + * Delayed extent is returned via @es. + */ +ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +{ + struct ext4_es_tree *tree = NULL; + struct extent_status *es1 = NULL; + struct rb_node *node; + ext4_lblk_t ret = EXT_MAX_BLOCKS; + + trace_ext4_es_find_extent_enter(inode, es->start); + + read_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + + /* find delay extent in cache firstly */ + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(es->start, es1->start, es1->len)) { + es_debug("%u cached by [%u/%u)\n", + es->start, es1->start, es1->len); + goto out; + } + } + + es->len = 0; + es1 = __es_tree_search(&tree->root, es->start); + +out: + if (es1) { + tree->cache_es = es1; + es->start = es1->start; + es->len = es1->len; + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + ret = es1->start; + } + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_find_extent_exit(inode, es, ret); + return ret; +} + +static struct extent_status * +ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +{ + struct extent_status *es; + es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + if (es == NULL) + return NULL; + es->start = start; + es->len = len; + return es; +} + +static void ext4_es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); +} + +static struct extent_status * +ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_prev(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (es->start == extent_status_end(es1) + 1) { + es1->len += es->len; + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + es = es1; + } + + return es; +} + +static struct extent_status * +ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) +{ + struct extent_status *es1; + struct rb_node *node; + + node = rb_next(&es->rb_node); + if (!node) + return es; + + es1 = rb_entry(node, struct extent_status, rb_node); + if (es1->start == extent_status_end(es) + 1) { + es->len += es1->len; + rb_erase(node, &tree->root); + ext4_es_free_extent(es1); + } + + return es; +} + +static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct extent_status *es; + ext4_lblk_t end = offset + len - 1; + + BUG_ON(end < offset); + es = tree->cache_es; + if (es && offset == (extent_status_end(es) + 1)) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } else if (es && es->start == end + 1) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } else if (es && es->start <= offset && + end <= extent_status_end(es)) { + es_debug("cached by [%u/%u)\n", es->start, es->len); + goto out; + } + + while (*p) { + parent = *p; + es = rb_entry(parent, struct extent_status, rb_node); + + if (offset < es->start) { + if (es->start == end + 1) { + es->start = offset; + es->len += len; + es = ext4_es_try_to_merge_left(tree, es); + goto out; + } + p = &(*p)->rb_left; + } else if (offset > extent_status_end(es)) { + if (offset == extent_status_end(es) + 1) { + es->len += len; + es = ext4_es_try_to_merge_right(tree, es); + goto out; + } + p = &(*p)->rb_right; + } else { + if (extent_status_end(es) <= end) + es->len = offset - es->start + len; + goto out; + } + } + + es = ext4_es_alloc_extent(offset, len); + if (!es) + return -ENOMEM; + rb_link_node(&es->rb_node, parent, p); + rb_insert_color(&es->rb_node, &tree->root); + +out: + tree->cache_es = es; + return 0; +} + +/* + * ext4_es_insert_extent() adds a space to a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * ext4_es_insert_extent is called by ext4_da_write_begin and + * ext4_es_remove_extent. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct ext4_es_tree *tree; + int err = 0; + + trace_ext4_es_insert_extent(inode, offset, len); + es_debug("add [%u/%u) to extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + err = __es_insert_extent(tree, offset, len); + write_unlock(&EXT4_I(inode)->i_es_lock); + + ext4_es_print_tree(inode); + + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a delayed extent tree. + * Caller holds inode->i_es_lock. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node *node; + struct ext4_es_tree *tree; + struct extent_status *es; + struct extent_status orig_es; + ext4_lblk_t len1, len2, end; + int err = 0; + + trace_ext4_es_remove_extent(inode, offset, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + offset, len, inode->i_ino); + + end = offset + len - 1; + BUG_ON(end < offset); + write_lock(&EXT4_I(inode)->i_es_lock); + tree = &EXT4_I(inode)->i_es_tree; + es = __es_tree_search(&tree->root, offset); + if (!es) + goto out; + if (es->start > end) + goto out; + + /* Simply invalidate cache_es. */ + tree->cache_es = NULL; + + orig_es.start = es->start; + orig_es.len = es->len; + len1 = offset > es->start ? offset - es->start : 0; + len2 = extent_status_end(es) > end ? + extent_status_end(es) - end : 0; + if (len1 > 0) + es->len = len1; + if (len2 > 0) { + if (len1 > 0) { + err = __es_insert_extent(tree, end + 1, len2); + if (err) { + es->start = orig_es.start; + es->len = orig_es.len; + goto out; + } + } else { + es->start = end + 1; + es->len = len2; + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&es->rb_node); + if (node) + es = rb_entry(node, struct extent_status, rb_node); + else + es = NULL; + } + + while (es && extent_status_end(es) <= end) { + node = rb_next(&es->rb_node); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(es); + if (!node) { + es = NULL; + break; + } + es = rb_entry(node, struct extent_status, rb_node); + } + + if (es && es->start < end + 1) { + len1 = extent_status_end(es) - end; + es->start = end + 1; + es->len = len1; + } + +out: + write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_print_tree(inode); + return err; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h new file mode 100644 index 00000000000..077f82db092 --- /dev/null +++ b/fs/ext4/extents_status.h @@ -0,0 +1,45 @@ +/* + * fs/ext4/extents_status.h + * + * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> + * Modified by + * Allison Henderson <achender@linux.vnet.ibm.com> + * Zheng Liu <wenqing.lz@taobao.com> + * + */ + +#ifndef _EXT4_EXTENTS_STATUS_H +#define _EXT4_EXTENTS_STATUS_H + +/* + * Turn on ES_DEBUG__ to get lots of info about extent status operations. + */ +#ifdef ES_DEBUG__ +#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#else +#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#endif + +struct extent_status { + struct rb_node rb_node; + ext4_lblk_t start; /* first block extent covers */ + ext4_lblk_t len; /* length of extent in block */ +}; + +struct ext4_es_tree { + struct rb_root root; + struct extent_status *cache_es; /* recently accessed extent */ +}; + +extern int __init ext4_init_es(void); +extern void ext4_exit_es(void); +extern void ext4_es_init_tree(struct ext4_es_tree *tree); + +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, + struct extent_status *es); + +#endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bf3966bccd3..b64a60bf105 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -24,6 +24,7 @@ #include <linux/mount.h> #include <linux/path.h> #include <linux/quotaops.h> +#include <linux/pagevec.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -286,6 +287,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } /* + * Here we use ext4_map_blocks() to get a block mapping for a extent-based + * file rather than ext4_ext_walk_space() because we can introduce + * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same + * function. When extent status tree has been fully implemented, it will + * track all extent status for a file and we can directly use it to + * retrieve the offset for SEEK_DATA/SEEK_HOLE. + */ + +/* + * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to + * lookup page cache to check whether or not there has some data between + * [startoff, endoff] because, if this range contains an unwritten extent, + * we determine this extent as a data or a hole according to whether the + * page cache has data or not. + */ +static int ext4_find_unwritten_pgoff(struct inode *inode, + int origin, + struct ext4_map_blocks *map, + loff_t *offset) +{ + struct pagevec pvec; + unsigned int blkbits; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff; + loff_t lastoff; + int found = 0; + + blkbits = inode->i_sb->s_blocksize_bits; + startoff = *offset; + lastoff = startoff; + endoff = (map->m_lblk + map->m_len) << blkbits; + + index = startoff >> PAGE_CACHE_SHIFT; + end = endoff >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + do { + int i, num; + unsigned long nr_pages; + + num = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + (pgoff_t)num); + if (nr_pages == 0) { + if (origin == SEEK_DATA) + break; + + BUG_ON(origin != SEEK_HOLE); + /* + * If this is the first time to go into the loop and + * offset is not beyond the end offset, it will be a + * hole at this offset + */ + if (lastoff == startoff || lastoff < endoff) + found = 1; + break; + } + + /* + * If this is the first time to go into the loop and + * offset is smaller than the first page offset, it will be a + * hole at this offset. + */ + if (lastoff == startoff && origin == SEEK_HOLE && + lastoff < page_offset(pvec.pages[0])) { + found = 1; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + /* + * If the current offset is not beyond the end of given + * range, it will be a hole. + */ + if (lastoff < endoff && origin == SEEK_HOLE && + page->index > end) { + found = 1; + *offset = lastoff; + goto out; + } + + lock_page(page); + + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + if (page_has_buffers(page)) { + lastoff = page_offset(page); + bh = head = page_buffers(page); + do { + if (buffer_uptodate(bh) || + buffer_unwritten(bh)) { + if (origin == SEEK_DATA) + found = 1; + } else { + if (origin == SEEK_HOLE) + found = 1; + } + if (found) { + *offset = max_t(loff_t, + startoff, lastoff); + unlock_page(page); + goto out; + } + lastoff += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); + } + + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The no. of pages is less than our desired, that would be a + * hole in there. + */ + if (nr_pages < num && origin == SEEK_HOLE) { + found = 1; + *offset = lastoff; + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +/* + * ext4_seek_data() retrieves the offset for SEEK_DATA. + */ +static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t dataoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + dataoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a delay extent at this offset, + * it will be as a data. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + if (last != start) + dataoff = last << blkbits; + break; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, + &map, &dataoff); + if (unwritten) + break; + } + + last++; + dataoff = last << blkbits; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (dataoff > isize) + return -ENXIO; + + if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (dataoff > maxsize) + return -EINVAL; + + if (dataoff != file->f_pos) { + file->f_pos = dataoff; + file->f_version = 0; + } + + return dataoff; +} + +/* + * ext4_seek_hole() retrieves the offset for SEEK_HOLE. + */ +static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) +{ + struct inode *inode = file->f_mapping->host; + struct ext4_map_blocks map; + struct extent_status es; + ext4_lblk_t start, last, end; + loff_t holeoff, isize; + int blkbits; + int ret = 0; + + mutex_lock(&inode->i_mutex); + + isize = i_size_read(inode); + if (offset >= isize) { + mutex_unlock(&inode->i_mutex); + return -ENXIO; + } + + blkbits = inode->i_sb->s_blocksize_bits; + start = offset >> blkbits; + last = start; + end = isize >> blkbits; + holeoff = offset; + + do { + map.m_lblk = last; + map.m_len = end - last + 1; + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { + last += ret; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a delay extent at this offset, + * we will skip this extent. + */ + es.start = last; + (void)ext4_es_find_extent(inode, &es); + if (last >= es.start && + last < es.start + es.len) { + last = es.start + es.len; + holeoff = last << blkbits; + continue; + } + + /* + * If there is a unwritten extent at this offset, + * it will be as a data or a hole according to page + * cache that has data or not. + */ + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int unwritten; + unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + &map, &holeoff); + if (!unwritten) { + last += ret; + holeoff = last << blkbits; + continue; + } + } + + /* find a hole */ + break; + } while (last <= end); + + mutex_unlock(&inode->i_mutex); + + if (holeoff > isize) + holeoff = isize; + + if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) + return -EINVAL; + if (holeoff > maxsize) + return -EINVAL; + + if (holeoff != file->f_pos) { + file->f_pos = holeoff; + file->f_version = 0; + } + + return holeoff; +} + +/* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. @@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin) else maxbytes = inode->i_sb->s_maxbytes; - return generic_file_llseek_size(file, offset, origin, - maxbytes, i_size_read(inode)); + switch (origin) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + return generic_file_llseek_size(file, offset, origin, + maxbytes, i_size_read(inode)); + case SEEK_DATA: + return ext4_seek_data(file, offset, maxbytes); + case SEEK_HOLE: + return ext4_seek_hole(file, offset, maxbytes); + } + + return -EINVAL; } const struct file_operations ext4_file_operations = { @@ -326,12 +656,10 @@ const struct file_operations ext4_file_operations = { const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index be1d89f385b..dfbc1fe9667 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,7 +44,6 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct writeback_control wbc; struct dentry *dentry = NULL; struct inode *next; int ret = 0; @@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode) ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; - memset(&wbc, 0, sizeof(wbc)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.nr_to_write = 0; /* only write out the inode */ - ret = sync_inode(inode, &wbc); + ret = sync_inode_metadata(inode, 1); if (ret) break; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 3a100e7a62a..3f32c801244 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -762,7 +762,6 @@ got: BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh); - brelse(block_bitmap_bh); /* recheck and clear flag under lock if we still need to */ ext4_lock_group(sb, group); @@ -775,6 +774,7 @@ got: ext4_group_desc_csum_set(sb, group, gdp); } ext4_unlock_group(sb, group); + brelse(block_bitmap_bh); if (err) goto fail; @@ -902,6 +902,10 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + ei->i_inline_off = 0; + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + ret = inode; dquot_initialize(inode); err = dquot_alloc_inode(inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 792e388e7b4..20862f96e8a 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -22,6 +22,7 @@ #include "ext4_jbd2.h" #include "truncate.h" +#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */ #include <trace/events/ext4.h> @@ -755,8 +756,7 @@ cleanup: partial--; } out: - trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, - map->m_pblk, map->m_len, err); + trace_ext4_ind_map_blocks_exit(inode, map, err); return err; } @@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode) down_write(&ei->i_data_sem); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); /* * The orphan list entry will now protect us from any crash which diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c new file mode 100644 index 00000000000..387c47c6cda --- /dev/null +++ b/fs/ext4/inline.c @@ -0,0 +1,1884 @@ +/* + * Copyright (c) 2012 Taobao. + * Written by Tao Ma <boyu.mt@taobao.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include "ext4_jbd2.h" +#include "ext4.h" +#include "xattr.h" +#include "truncate.h" +#include <linux/fiemap.h> + +#define EXT4_XATTR_SYSTEM_DATA "data" +#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) +#define EXT4_INLINE_DOTDOT_SIZE 4 + +int ext4_get_inline_size(struct inode *inode) +{ + if (EXT4_I(inode)->i_inline_off) + return EXT4_I(inode)->i_inline_size; + + return 0; +} + +static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_entry *entry; + struct ext4_inode *raw_inode; + int free, min_offs; + + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - + sizeof(struct ext4_xattr_ibody_header); + + /* + * We need to subtract another sizeof(__u32) since an in-inode xattr + * needs an empty 4 bytes to indicate the gap between the xattr entry + * and the name/value pair. + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return EXT4_XATTR_SIZE(min_offs - + EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) - + EXT4_XATTR_ROUND - sizeof(__u32)); + + raw_inode = ext4_raw_inode(iloc); + header = IHDR(inode, raw_inode); + entry = IFIRST(header); + + /* Compute min_offs. */ + for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_block && entry->e_value_size) { + size_t offs = le16_to_cpu(entry->e_value_offs); + if (offs < min_offs) + min_offs = offs; + } + } + free = min_offs - + ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32); + + if (EXT4_I(inode)->i_inline_off) { + entry = (struct ext4_xattr_entry *) + ((void *)raw_inode + EXT4_I(inode)->i_inline_off); + + free += le32_to_cpu(entry->e_value_size); + goto out; + } + + free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)); + + if (free > EXT4_XATTR_ROUND) + free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND); + else + free = 0; + +out: + return free; +} + +/* + * Get the maximum size we now can store in an inode. + * If we can't find the space for a xattr entry, don't use the space + * of the extents since we have no space to indicate the inline data. + */ +int ext4_get_max_inline_size(struct inode *inode) +{ + int error, max_inline_size; + struct ext4_iloc iloc; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) { + ext4_error_inode(inode, __func__, __LINE__, 0, + "can't get inode location %lu", + inode->i_ino); + return 0; + } + + down_read(&EXT4_I(inode)->xattr_sem); + max_inline_size = get_max_inline_xattr_value_size(inode, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + + brelse(iloc.bh); + + if (!max_inline_size) + return 0; + + return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; +} + +int ext4_has_inline_data(struct inode *inode) +{ + return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + EXT4_I(inode)->i_inline_off; +} + +/* + * this function does not take xattr_sem, which is OK because it is + * currently only used in a code path coming form ext4_iget, before + * the new inode has been unlocked + */ +int ext4_find_inline_data_nolock(struct inode *inode) +{ + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + if (!is.s.not_found) { + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + } +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_read_inline_data(struct inode *inode, void *buffer, + unsigned int len, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + int cp_len = 0; + struct ext4_inode *raw_inode; + + if (!len) + return 0; + + BUG_ON(len > EXT4_I(inode)->i_inline_size); + + cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ? + len : EXT4_MIN_INLINE_DATA_SIZE; + + raw_inode = ext4_raw_inode(iloc); + memcpy(buffer, (void *)(raw_inode->i_block), cp_len); + + len -= cp_len; + buffer += cp_len; + + if (!len) + goto out; + + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + len = min_t(unsigned int, len, + (unsigned int)le32_to_cpu(entry->e_value_size)); + + memcpy(buffer, + (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len); + cp_len += len; + +out: + return cp_len; +} + +/* + * write the buffer to the inline inode. + * If 'create' is set, we don't need to do the extra copy in the xattr + * value since it is already handled by ext4_xattr_ibody_inline_set. + * That saves us one memcpy. + */ +void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, + void *buffer, loff_t pos, unsigned int len) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + int cp_len = 0; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + BUG_ON(pos + len > EXT4_I(inode)->i_inline_size); + + raw_inode = ext4_raw_inode(iloc); + buffer += pos; + + if (pos < EXT4_MIN_INLINE_DATA_SIZE) { + cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE - pos : len; + memcpy((void *)raw_inode->i_block + pos, buffer, cp_len); + + len -= cp_len; + buffer += cp_len; + pos += cp_len; + } + + if (!len) + return; + + pos -= EXT4_MIN_INLINE_DATA_SIZE; + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + + memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos, + buffer, len); +} + +static int ext4_create_inline_data(handle_t *handle, + struct inode *inode, unsigned len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + if (len > EXT4_MIN_INLINE_DATA_SIZE) { + value = EXT4_ZERO_XATTR_VALUE; + len -= EXT4_MIN_INLINE_DATA_SIZE; + } else { + value = ""; + len = 0; + } + + /* Insert the the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(!is.s.not_found); + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) { + if (error == -ENOSPC) + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; + ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + brelse(is.iloc.bh); + return error; +} + +static int ext4_update_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int error; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + /* If the old space is ok, write the data directly. */ + if (len <= EXT4_I(inode)->i_inline_size) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + BUG_ON(is.s.not_found); + + len -= EXT4_MIN_INLINE_DATA_SIZE; + value = kzalloc(len, GFP_NOFS); + if (!value) + goto out; + + error = ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, len); + if (error == -ENODATA) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + /* Update the xttr entry. */ + i.value = value; + i.value_len = len; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - + (void *)ext4_raw_inode(&is.iloc)); + EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + + le32_to_cpu(is.s.here->e_value_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + +out: + kfree(value); + brelse(is.iloc.bh); + return error; +} + +int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len) +{ + int ret, size; + struct ext4_inode_info *ei = EXT4_I(inode); + + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + return -ENOSPC; + + size = ext4_get_max_inline_size(inode); + if (size < len) + return -ENOSPC; + + down_write(&EXT4_I(inode)->xattr_sem); + + if (ei->i_inline_off) + ret = ext4_update_inline_data(handle, inode, len); + else + ret = ext4_create_inline_data(handle, inode, len); + + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +static int ext4_destroy_inline_data_nolock(handle_t *handle, + struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_xattr_ibody_find is = { + .s = { .not_found = 0, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + .value = NULL, + .value_len = 0, + }; + int error; + + if (!ei->i_inline_off) + return 0; + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) + return error; + + error = ext4_xattr_ibody_find(inode, &i, &is); + if (error) + goto out; + + error = ext4_journal_get_write_access(handle, is.iloc.bh); + if (error) + goto out; + + error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + if (error) + goto out; + + memset((void *)ext4_raw_inode(&is.iloc)->i_block, + 0, EXT4_MIN_INLINE_DATA_SIZE); + + if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (S_ISDIR(inode->i_mode) || + S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { + ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } + ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + + get_bh(is.iloc.bh); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + + EXT4_I(inode)->i_inline_off = 0; + EXT4_I(inode)->i_inline_size = 0; + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +out: + brelse(is.iloc.bh); + if (error == -ENODATA) + error = 0; + return error; +} + +static int ext4_read_inline_page(struct inode *inode, struct page *page) +{ + void *kaddr; + int ret = 0; + size_t len; + struct ext4_iloc iloc; + + BUG_ON(!PageLocked(page)); + BUG_ON(!ext4_has_inline_data(inode)); + BUG_ON(page->index); + + if (!EXT4_I(inode)->i_inline_off) { + ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + inode->i_ino); + goto out; + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + goto out; + + len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); + kaddr = kmap_atomic(page); + ret = ext4_read_inline_data(inode, kaddr, len, &iloc); + flush_dcache_page(page); + kunmap_atomic(kaddr); + zero_user_segment(page, len, PAGE_CACHE_SIZE); + SetPageUptodate(page); + brelse(iloc.bh); + +out: + return ret; +} + +int ext4_readpage_inline(struct inode *inode, struct page *page) +{ + int ret = 0; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + return -EAGAIN; + } + + /* + * Current inline data can only exist in the 1st page, + * So for all the other pages, just set them uptodate. + */ + if (!page->index) + ret = ext4_read_inline_page(inode, page); + else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + } + + up_read(&EXT4_I(inode)->xattr_sem); + + unlock_page(page); + return ret >= 0 ? 0 : ret; +} + +static int ext4_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags) +{ + int ret, needed_blocks; + handle_t *handle = NULL; + int retries = 0, sem_held = 0; + struct page *page = NULL; + unsigned from, to; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + /* + * clear the flag so that no new write + * will trap here again. + */ + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + +retry: + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + sem_held = 1; + /* If some one has already done this for us, just exit. */ + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out; + } + + from = 0; + to = ext4_get_inline_size(inode); + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = ext4_destroy_inline_data_nolock(handle, inode); + if (ret) + goto out; + + if (ext4_should_dioread_nolock(inode)) + ret = __block_write_begin(page, from, to, ext4_get_block_write); + else + ret = __block_write_begin(page, from, to, ext4_get_block); + + if (!ret && ext4_should_journal_data(inode)) { + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); + } + + if (ret) { + unlock_page(page); + page_cache_release(page); + ext4_orphan_add(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + sem_held = 0; + ext4_journal_stop(handle); + handle = NULL; + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + + block_commit_write(page, from, to); +out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + if (sem_held) + up_write(&EXT4_I(inode)->xattr_sem); + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page the handle, move the data + * to the page make it update and let the later codes create extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep) +{ + int ret; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + if (pos + len > ext4_get_max_inline_size(inode)) + goto convert; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + /* + * The possible write could happen in the inode, + * so try to reserve the space in inode first. + */ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + + /* We don't have space in inline inode, so convert it to extent. */ + if (ret == -ENOSPC) { + ext4_journal_stop(handle); + brelse(iloc.bh); + goto convert; + } + + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + *pagep = page; + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + unlock_page(page); + page_cache_release(page); + goto out_up_read; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_up_read; + } + + ret = 1; + handle = NULL; +out_up_read: + up_read(&EXT4_I(inode)->xattr_sem); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +convert: + return ext4_convert_inline_data_to_extent(mapping, + inode, flags); +} + +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + if (unlikely(copied < len)) { + if (!PageUptodate(page)) { + copied = 0; + goto out; + } + } + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + copied = 0; + goto out; + } + + down_write(&EXT4_I(inode)->xattr_sem); + BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + up_write(&EXT4_I(inode)->xattr_sem); + brelse(iloc.bh); +out: + return copied; +} + +struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page) +{ + int ret; + void *kaddr; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + ext4_std_error(inode->i_sb, ret); + return NULL; + } + + down_write(&EXT4_I(inode)->xattr_sem); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, 0, len); + kunmap_atomic(kaddr); + up_write(&EXT4_I(inode)->xattr_sem); + + return iloc.bh; +} + +/* + * Try to make the page cache and handle ready for the inline data case. + * We can call this function in 2 cases: + * 1. The inode is created and the first write exceeds inline size. We can + * clear the inode state safely. + * 2. The inode has inline data, then we need to read the data, make it + * update and dirty so that ext4_da_writepages can handle it. We don't + * need to start the journal since the file's metatdata isn't changed now. + */ +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + unsigned flags, + void **fsdata) +{ + int ret = 0, inline_size; + struct page *page; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) + return -ENOMEM; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + goto out; + } + + inline_size = ext4_get_inline_size(inode); + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out; + } + + ret = __block_write_begin(page, 0, inline_size, + ext4_da_get_block_prep); + if (ret) { + ext4_truncate_failed_write(inode); + goto out; + } + + SetPageDirty(page); + SetPageUptodate(page); + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + *fsdata = (void *)CONVERT_INLINE_DATA; + +out: + up_read(&EXT4_I(inode)->xattr_sem); + if (page) { + unlock_page(page); + page_cache_release(page); + } + return ret; +} + +/* + * Prepare the write for the inline data. + * If the the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, makes it dirty so that it can be + * handle in writepages(the i_disksize update is left to the + * normal ext4_da_write_end). + */ +int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + int ret, inline_size; + handle_t *handle; + struct page *page; + struct ext4_iloc iloc; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + inline_size = ext4_get_max_inline_size(inode); + + ret = -ENOSPC; + if (inline_size >= pos + len) { + ret = ext4_prepare_inline_data(handle, inode, pos + len); + if (ret && ret != -ENOSPC) + goto out; + } + + if (ret == -ENOSPC) { + ret = ext4_da_convert_inline_data_to_extent(mapping, + inode, + flags, + fsdata); + goto out; + } + + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + + page = grab_cache_page_write_begin(mapping, 0, flags); + if (!page) { + ret = -ENOMEM; + goto out; + } + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + ret = 0; + goto out_release_page; + } + + if (!PageUptodate(page)) { + ret = ext4_read_inline_page(inode, page); + if (ret < 0) + goto out_release_page; + } + + up_read(&EXT4_I(inode)->xattr_sem); + *pagep = page; + handle = NULL; + brelse(iloc.bh); + return 1; +out_release_page: + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); +out: + if (handle) + ext4_journal_stop(handle); + brelse(iloc.bh); + return ret; +} + +int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page) +{ + int i_size_changed = 0; + + copied = ext4_write_inline_data_end(inode, pos, len, copied, page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + mark_inode_dirty(inode); + + return copied; +} + +#ifdef INLINE_DIR_DEBUG +void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, + void *inline_start, int inline_size) +{ + int offset; + unsigned short de_len; + struct ext4_dir_entry_2 *de = inline_start; + void *dlimit = inline_start + inline_size; + + trace_printk("inode %lu\n", dir->i_ino); + offset = 0; + while ((void *)de < dlimit) { + de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); + trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + offset, de_len, de->name_len, de->name, + de->name_len, le32_to_cpu(de->inode)); + if (ext4_check_dir_entry(dir, NULL, de, bh, + inline_start, inline_size, offset)) + BUG(); + + offset += de_len; + de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); + } +} +#else +#define ext4_show_inline_dir(dir, bh, inline_start, inline_size) +#endif + +/* + * Add a new entry into a inline dir. + * It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. + */ +static int ext4_add_dirent_to_inline(handle_t *handle, + struct dentry *dentry, + struct inode *inode, + struct ext4_iloc *iloc, + void *inline_start, int inline_size) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned short reclen; + int err; + struct ext4_dir_entry_2 *de; + + reclen = EXT4_DIR_REC_LEN(namelen); + err = ext4_find_dest_de(dir, inode, iloc->bh, + inline_start, inline_size, + name, namelen, &de); + if (err) + return err; + + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; + ext4_insert_dentry(inode, de, inline_size, name, namelen); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext4_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. + */ + dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + ext4_update_dx_flag(dir); + dir->i_version++; + ext4_mark_inode_dirty(handle, dir); + return 1; +} + +static void *ext4_get_inline_xattr_pos(struct inode *inode, + struct ext4_iloc *iloc) +{ + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + + BUG_ON(!EXT4_I(inode)->i_inline_off); + + header = IHDR(inode, ext4_raw_inode(iloc)); + entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) + + EXT4_I(inode)->i_inline_off); + + return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs); +} + +/* Set the final de to cover the whole block. */ +static void ext4_update_final_de(void *de_buf, int old_size, int new_size) +{ + struct ext4_dir_entry_2 *de, *prev_de; + void *limit; + int de_len; + + de = (struct ext4_dir_entry_2 *)de_buf; + if (old_size) { + limit = de_buf + old_size; + do { + prev_de = de; + de_len = ext4_rec_len_from_disk(de->rec_len, old_size); + de_buf += de_len; + de = (struct ext4_dir_entry_2 *)de_buf; + } while (de_buf < limit); + + prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - + old_size, new_size); + } else { + /* this is just created, so create an empty entry. */ + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(new_size, new_size); + } +} + +static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, + struct ext4_iloc *iloc) +{ + int ret; + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + + if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, + new_size + EXT4_MIN_INLINE_DATA_SIZE); + if (ret) + return ret; + + ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size, + EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE); + dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size; + return 0; +} + +static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc, + void *buf, int inline_size) +{ + ext4_create_inline_data(handle, inode, inline_size); + ext4_write_inline_data(inode, iloc, buf, 0, inline_size); + ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); +} + +static int ext4_finish_convert_inline_dir(handle_t *handle, + struct inode *inode, + struct buffer_head *dir_block, + void *buf, + int inline_size) +{ + int err, csum_size = 0, header_size = 0; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; + void *target = dir_block->b_data; + + /* + * First create "." and ".." and then copy the dir information + * back to the block. + */ + de = (struct ext4_dir_entry_2 *)target; + de = ext4_init_dot_dotdot(inode, de, + inode->i_sb->s_blocksize, csum_size, + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); + header_size = (void *)de - target; + + memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, + inline_size - EXT4_INLINE_DOTDOT_SIZE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + inode->i_size = inode->i_sb->s_blocksize; + i_size_write(inode, inode->i_sb->s_blocksize); + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; + ext4_update_final_de(dir_block->b_data, + inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, + inode->i_sb->s_blocksize - csum_size); + + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, + inode->i_sb->s_blocksize); + initialize_dirent_tail(t, inode->i_sb->s_blocksize); + } + set_buffer_uptodate(dir_block); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + return err; +} + +static int ext4_convert_inline_data_nolock(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc) +{ + int error; + void *buf = NULL; + struct buffer_head *data_bh = NULL; + struct ext4_map_blocks map; + int inline_size; + + inline_size = ext4_get_inline_size(inode); + buf = kmalloc(inline_size, GFP_NOFS); + if (!buf) { + error = -ENOMEM; + goto out; + } + + error = ext4_read_inline_data(inode, buf, inline_size, iloc); + if (error < 0) + goto out; + + error = ext4_destroy_inline_data_nolock(handle, inode); + if (error) + goto out; + + map.m_lblk = 0; + map.m_len = 1; + map.m_flags = 0; + error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE); + if (error < 0) + goto out_restore; + if (!(map.m_flags & EXT4_MAP_MAPPED)) { + error = -EIO; + goto out_restore; + } + + data_bh = sb_getblk(inode->i_sb, map.m_pblk); + if (!data_bh) { + error = -EIO; + goto out_restore; + } + + lock_buffer(data_bh); + error = ext4_journal_get_create_access(handle, data_bh); + if (error) { + unlock_buffer(data_bh); + error = -EIO; + goto out_restore; + } + memset(data_bh->b_data, 0, inode->i_sb->s_blocksize); + + if (!S_ISDIR(inode->i_mode)) { + memcpy(data_bh->b_data, buf, inline_size); + set_buffer_uptodate(data_bh); + error = ext4_handle_dirty_metadata(handle, + inode, data_bh); + } else { + error = ext4_finish_convert_inline_dir(handle, inode, data_bh, + buf, inline_size); + } + + unlock_buffer(data_bh); +out_restore: + if (error) + ext4_restore_inline_data(handle, inode, iloc, buf, inline_size); + +out: + brelse(data_bh); + kfree(buf); + return error; +} + +/* + * Try to add the new entry to the inline data. + * If succeeds, return 0. If not, extended the inline dir and copied data to + * the new created block. + */ +int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) +{ + int ret, inline_size; + void *inline_start; + struct ext4_iloc iloc; + struct inode *dir = dentry->d_parent->d_inode; + + ret = ext4_get_inode_loc(dir, &iloc); + if (ret) + return ret; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) + goto out; + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + if (ret != -ENOSPC) + goto out; + + /* check whether it can be inserted to inline xattr space. */ + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + if (!inline_size) { + /* Try to use the xattr space.*/ + ret = ext4_update_inline_dir(handle, dir, &iloc); + if (ret && ret != -ENOSPC) + goto out; + + inline_size = EXT4_I(dir)->i_inline_size - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_size) { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + + ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + inline_start, inline_size); + + if (ret != -ENOSPC) + goto out; + } + + /* + * The inline space is filled up, so create a new block for it. + * As the extent tree will be created, we have to save the inline + * dir first. + */ + ret = ext4_convert_inline_data_nolock(handle, dir, &iloc); + +out: + ext4_mark_inode_dirty(handle, dir); + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data) +{ + int error = 0; + unsigned int offset, parent_ino; + int i, stored; + struct ext4_dir_entry_2 *de; + struct super_block *sb; + struct inode *inode = filp->f_path.dentry->d_inode; + int ret, inline_size = 0; + struct ext4_iloc iloc; + void *dir_buf = NULL; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_read(&EXT4_I(inode)->xattr_sem); + *has_inline_data = 0; + goto out; + } + + inline_size = ext4_get_inline_size(inode); + dir_buf = kmalloc(inline_size, GFP_NOFS); + if (!dir_buf) { + ret = -ENOMEM; + up_read(&EXT4_I(inode)->xattr_sem); + goto out; + } + + ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc); + up_read(&EXT4_I(inode)->xattr_sem); + if (ret < 0) + goto out; + + sb = inode->i_sb; + stored = 0; + parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); + + while (!error && !stored && filp->f_pos < inode->i_size) { +revalidate: + /* + * If the version has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the inline + * dir to make sure. + */ + if (filp->f_version != inode->i_version) { + for (i = 0; + i < inode->i_size && i < offset;) { + if (!i) { + /* skip "." and ".." if needed. */ + i += EXT4_INLINE_DOTDOT_SIZE; + continue; + } + de = (struct ext4_dir_entry_2 *) + (dir_buf + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, + inline_size) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = i; + filp->f_pos = offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size) { + if (filp->f_pos == 0) { + error = filldir(dirent, ".", 1, 0, inode->i_ino, + DT_DIR); + if (error) + break; + stored++; + + error = filldir(dirent, "..", 2, 0, parent_ino, + DT_DIR); + if (error) + break; + stored++; + + filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE; + continue; + } + + de = (struct ext4_dir_entry_2 *)(dir_buf + offset); + if (ext4_check_dir_entry(inode, filp, de, + iloc.bh, dir_buf, + inline_size, offset)) { + ret = stored; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, + inline_size); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + u64 version = filp->f_version; + + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored++; + } + filp->f_pos += ext4_rec_len_from_disk(de->rec_len, + inline_size); + } + offset = 0; + } +out: + kfree(dir_buf); + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval) +{ + struct ext4_iloc iloc; + + *retval = ext4_get_inode_loc(inode, &iloc); + if (*retval) + return NULL; + + *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + + return iloc.bh; +} + +/* + * Try to create the inline data for the new dir. + * If it succeeds, return 0, otherwise return the error. + * In case of ENOSPC, the caller should create the normal disk layout dir. + */ +int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, + struct inode *inode) +{ + int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE; + struct ext4_iloc iloc; + struct ext4_dir_entry_2 *de; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ret; + + ret = ext4_prepare_inline_data(handle, inode, inline_size); + if (ret) + goto out; + + /* + * For inline dir, we only save the inode information for the ".." + * and create a fake dentry to cover the left space. + */ + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + de->inode = cpu_to_le32(parent->i_ino); + de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE); + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk( + inline_size - EXT4_INLINE_DOTDOT_SIZE, + inline_size); + set_nlink(inode, 2); + inode->i_size = EXT4_I(inode)->i_disksize = inline_size; +out: + brelse(iloc.bh); + return ret; +} + +struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data) +{ + int ret; + struct ext4_iloc iloc; + void *inline_start; + int inline_size; + + if (ext4_get_inode_loc(dir, &iloc)) + return NULL; + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + if (ret < 0) + goto out; + + if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE) + goto out; + + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; + + ret = search_dir(iloc.bh, inline_start, inline_size, + dir, d_name, 0, res_dir); + if (ret == 1) + goto out_find; + +out: + brelse(iloc.bh); + iloc.bh = NULL; +out_find: + up_read(&EXT4_I(dir)->xattr_sem); + return iloc.bh; +} + +int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_start; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) + return err; + + down_write(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) < + EXT4_MIN_INLINE_DATA_SIZE) { + inline_start = (void *)ext4_raw_inode(&iloc)->i_block + + EXT4_INLINE_DOTDOT_SIZE; + inline_size = EXT4_MIN_INLINE_DATA_SIZE - + EXT4_INLINE_DOTDOT_SIZE; + } else { + inline_start = ext4_get_inline_xattr_pos(dir, &iloc); + inline_size = ext4_get_inline_size(dir) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + err = ext4_journal_get_write_access(handle, bh); + if (err) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, bh, + inline_start, inline_size, 0); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_mark_inode_dirty(handle, dir); + if (unlikely(err)) + goto out; + + ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size); +out: + up_write(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + +/* + * Get the inline dentry at offset. + */ +static inline struct ext4_dir_entry_2 * +ext4_get_inline_entry(struct inode *inode, + struct ext4_iloc *iloc, + unsigned int offset, + void **inline_start, + int *inline_size) +{ + void *inline_pos; + + BUG_ON(offset > ext4_get_inline_size(inode)); + + if (offset < EXT4_MIN_INLINE_DATA_SIZE) { + inline_pos = (void *)ext4_raw_inode(iloc)->i_block; + *inline_size = EXT4_MIN_INLINE_DATA_SIZE; + } else { + inline_pos = ext4_get_inline_xattr_pos(inode, iloc); + offset -= EXT4_MIN_INLINE_DATA_SIZE; + *inline_size = ext4_get_inline_size(inode) - + EXT4_MIN_INLINE_DATA_SIZE; + } + + if (inline_start) + *inline_start = inline_pos; + return (struct ext4_dir_entry_2 *)(inline_pos + offset); +} + +int empty_inline_dir(struct inode *dir, int *has_inline_data) +{ + int err, inline_size; + struct ext4_iloc iloc; + void *inline_pos; + unsigned int offset; + struct ext4_dir_entry_2 *de; + int ret = 1; + + err = ext4_get_inode_loc(dir, &iloc); + if (err) { + EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", + err, dir->i_ino); + return 1; + } + + down_read(&EXT4_I(dir)->xattr_sem); + if (!ext4_has_inline_data(dir)) { + *has_inline_data = 0; + goto out; + } + + de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; + if (!le32_to_cpu(de->inode)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - no `..'", + dir->i_ino); + ret = 1; + goto out; + } + + offset = EXT4_INLINE_DOTDOT_SIZE; + while (offset < dir->i_size) { + de = ext4_get_inline_entry(dir, &iloc, offset, + &inline_pos, &inline_size); + if (ext4_check_dir_entry(dir, NULL, de, + iloc.bh, inline_pos, + inline_size, offset)) { + ext4_warning(dir->i_sb, + "bad inline directory (dir #%lu) - " + "inode %u, rec_len %u, name_len %d" + "inline size %d\n", + dir->i_ino, le32_to_cpu(de->inode), + le16_to_cpu(de->rec_len), de->name_len, + inline_size); + ret = 1; + goto out; + } + if (le32_to_cpu(de->inode)) { + ret = 0; + goto out; + } + offset += ext4_rec_len_from_disk(de->rec_len, inline_size); + } + +out: + up_read(&EXT4_I(dir)->xattr_sem); + brelse(iloc.bh); + return ret; +} + +int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) +{ + int ret; + + down_write(&EXT4_I(inode)->xattr_sem); + ret = ext4_destroy_inline_data_nolock(handle, inode); + up_write(&EXT4_I(inode)->xattr_sem); + + return ret; +} + +int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + int error = 0; + struct ext4_iloc iloc; + + down_read(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + goto out; + } + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + goto out; + + physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; + physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; + physical += offsetof(struct ext4_inode, i_block); + length = i_size_read(inode); + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + brelse(iloc.bh); +out: + up_read(&EXT4_I(inode)->xattr_sem); + return (error < 0 ? error : 0); +} + +/* + * Called during xattr set, and if we can sparse space 'needed', + * just create the extent tree evict the data to the outer block. + * + * We use jbd2 instead of page cache to move data to the 1st block + * so that the whole transaction can be committed as a whole and + * the data isn't lost because of the delayed page cache write. + */ +int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed) +{ + int error; + struct ext4_xattr_entry *entry; + struct ext4_xattr_ibody_header *header; + struct ext4_inode *raw_inode; + struct ext4_iloc iloc; + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + raw_inode = ext4_raw_inode(&iloc); + header = IHDR(inode, raw_inode); + entry = (struct ext4_xattr_entry *)((void *)raw_inode + + EXT4_I(inode)->i_inline_off); + if (EXT4_XATTR_LEN(entry->e_name_len) + + EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) { + error = -ENOSPC; + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); +out: + brelse(iloc.bh); + return error; +} + +void ext4_inline_data_truncate(struct inode *inode, int *has_inline) +{ + handle_t *handle; + int inline_size, value_len, needed_blocks; + size_t i_size; + void *value = NULL; + struct ext4_xattr_ibody_find is = { + .s = { .not_found = -ENODATA, }, + }; + struct ext4_xattr_info i = { + .name_index = EXT4_XATTR_INDEX_SYSTEM, + .name = EXT4_XATTR_SYSTEM_DATA, + }; + + + needed_blocks = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) + return; + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + *has_inline = 0; + ext4_journal_stop(handle); + return; + } + + if (ext4_orphan_add(handle, inode)) + goto out; + + if (ext4_get_inode_loc(inode, &is.iloc)) + goto out; + + down_write(&EXT4_I(inode)->i_data_sem); + i_size = inode->i_size; + inline_size = ext4_get_inline_size(inode); + EXT4_I(inode)->i_disksize = i_size; + + if (i_size < inline_size) { + /* Clear the content in the xattr space. */ + if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { + if (ext4_xattr_ibody_find(inode, &i, &is)) + goto out_error; + + BUG_ON(is.s.not_found); + + value_len = le32_to_cpu(is.s.here->e_value_size); + value = kmalloc(value_len, GFP_NOFS); + if (!value) + goto out_error; + + if (ext4_xattr_ibody_get(inode, i.name_index, i.name, + value, value_len)) + goto out_error; + + i.value = value; + i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? + i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; + if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is)) + goto out_error; + } + + /* Clear the content within i_blocks. */ + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) + memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + + EXT4_I(inode)->i_inline_size = i_size < + EXT4_MIN_INLINE_DATA_SIZE ? + EXT4_MIN_INLINE_DATA_SIZE : i_size; + } + +out_error: + up_write(&EXT4_I(inode)->i_data_sem); +out: + brelse(is.iloc.bh); + up_write(&EXT4_I(inode)->xattr_sem); + kfree(value); + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + + ext4_journal_stop(handle); + return; +} + +int ext4_convert_inline_data(struct inode *inode) +{ + int error, needed_blocks; + handle_t *handle; + struct ext4_iloc iloc; + + if (!ext4_has_inline_data(inode)) { + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + return 0; + } + + needed_blocks = ext4_writepage_trans_blocks(inode); + + iloc.bh = NULL; + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto out_free; + } + + down_write(&EXT4_I(inode)->xattr_sem); + if (!ext4_has_inline_data(inode)) { + up_write(&EXT4_I(inode)->xattr_sem); + goto out; + } + + error = ext4_convert_inline_data_nolock(handle, inode, &iloc); + up_write(&EXT4_I(inode)->xattr_sem); +out: + ext4_journal_stop(handle); +out_free: + brelse(iloc.bh); + return error; +} diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3c243b9afa..cb1c1ab2720 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -484,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, } /* - * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. - */ -static void set_buffers_da_mapped(struct inode *inode, - struct ext4_map_blocks *map) -{ - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - int i, nr_pages; - pgoff_t index, end; - - index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); - end = (map->m_lblk + map->m_len - 1) >> - (PAGE_CACHE_SHIFT - inode->i_blkbits); - - pagevec_init(&pvec, 0); - while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, - min(end - index + 1, - (pgoff_t)PAGEVEC_SIZE)); - if (nr_pages == 0) - break; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct buffer_head *bh, *head; - - if (unlikely(page->mapping != mapping) || - !PageDirty(page)) - break; - - if (page_has_buffers(page)) { - bh = head = page_buffers(page); - do { - set_buffer_da_mapped(bh); - bh = bh->b_this_page; - } while (bh != head); - } - index++; - } - pagevec_release(&pvec); - } -} - -/* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * @@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, map); + int ret; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + /* delayed alloc may be allocated by fallocate and + * coverted to initialized by directIO. + * we need to handle delayed extent here. + */ + down_write((&EXT4_I(inode)->i_data_sem)); + goto delayed_mapped; + } + ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* If we have successfully mapped the delayed allocated blocks, - * set the BH_Da_Mapped bit on them. Its important to do this - * under the protection of i_data_sem. - */ - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) - set_buffers_da_mapped(inode, map); + if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { + int ret; +delayed_mapped: + /* delayed allocation blocks has been allocated */ + ret = ext4_es_remove_extent(inode, map->m_lblk, + map->m_len); + if (ret < 0) + retval = ret; + } } up_write((&EXT4_I(inode)->i_data_sem)); @@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, int ret = 0, started = 0; int dio_credits; + if (ext4_has_inline_data(inode)) + return -ERANGE; + map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !handle) { + if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers(handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)(handle_t *handle, - struct buffer_head *bh)) +int ext4_walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. */ -static int do_journal_get_write_access(handle_t *handle, - struct buffer_head *bh) +int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; @@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, + flags, pagep); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { @@ -919,6 +902,7 @@ retry: ret = -ENOMEM; goto out; } + *pagep = page; if (ext4_should_dioread_nolock(inode)) @@ -927,8 +911,9 @@ retry: ret = __block_write_begin(page, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, page_buffers(page), + from, to, NULL, + do_journal_get_write_access); } if (ret) { @@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file, struct inode *inode = mapping->host; handle_t *handle = ext4_journal_current_handle(); - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else + copied = block_write_end(file, mapping, pos, + len, copied, page, fsdata); /* * No need to use i_size_read() here, the i_size @@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (copied < len) { - if (!PageUptodate(page)) - copied = 0; - page_zero_new_buffers(page, from+copied, to); - } + if (ext4_has_inline_data(inode)) + copied = ext4_write_inline_data_end(inode, pos, len, + copied, page); + else { + if (copied < len) { + if (!PageUptodate(page)) + copied = 0; + page_zero_new_buffers(page, from+copied, to); + } - ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, write_end_fn); - if (!partial) - SetPageUptodate(page); + ret = ext4_walk_page_buffers(handle, page_buffers(page), from, + to, &partial, write_end_fn); + if (!partial) + SetPageUptodate(page); + } new_i_size = pos + copied; if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); @@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page, struct inode *inode = page->mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int num_clusters; + ext4_fsblk_t lblk; head = page_buffers(page); bh = head; @@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; clear_buffer_delay(bh); - clear_buffer_da_mapped(bh); } curr_off = next_off; } while ((bh = bh->b_this_page) != head); + if (to_release) { + lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, lblk, to_release); + } + /* If we have released all the blocks belonging to a cluster, then we * need to release the reserved space for that cluster. */ num_clusters = EXT4_NUM_B2C(sbi, to_release); while (num_clusters > 0) { - ext4_fsblk_t lblk; lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + ((num_clusters - 1) << sbi->s_cluster_bits); if (sbi->s_cluster_ratio == 1 || - !ext4_find_delalloc_cluster(inode, lblk, 1)) + !ext4_find_delalloc_cluster(inode, lblk)) ext4_da_release_space(inode, 1); num_clusters--; @@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, clear_buffer_delay(bh); bh->b_blocknr = pblock; } - if (buffer_da_mapped(bh)) - clear_buffer_da_mapped(bh); if (buffer_unwritten(bh) || buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) struct pagevec pvec; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; + ext4_lblk_t start, last; index = mpd->first_page; end = mpd->next_page - 1; + + start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); + ext4_es_remove_extent(inode, start, last - start + 1); + + pagevec_init(&pvec, 0); while (index <= end) { nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) @@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) for (i = 0; i < map.m_len; i++) unmap_underlying_metadata(bdev, map.m_pblk + i); - - if (ext4_should_order_data(mpd->inode)) { - err = ext4_jbd2_file_inode(handle, mpd->inode); - if (err) { - /* Only if the journal is aborted */ - mpd->retval = err; - goto submit_io; - } - } } /* @@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + if (ext4_has_inline_data(inode)) { + /* + * We will soon create blocks for this page, and let + * us pretend as if the blocks aren't allocated yet. + * In case of clusters, we have to handle the work + * of mapping from cluster so that the reserved space + * is calculated properly. + */ + if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && + ext4_find_delalloc_cluster(inode, map->m_lblk)) + map->m_flags |= EXT4_MAP_FROM_CLUSTER; + retval = 0; + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); @@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } + retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); + if (retval) + goto out_unlock; + /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served * and it should not appear on the bh->b_state. */ @@ -1842,8 +1853,8 @@ out_unlock: * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ -static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) +int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; @@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page, { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs; + struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; - int ret = 0; - int err; + int ret = 0, err = 0; + int inline_data = ext4_has_inline_data(inode); + struct buffer_head *inode_bh = NULL; ClearPageChecked(page); - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + + if (inline_data) { + BUG_ON(page->index != 0); + BUG_ON(len > ext4_get_max_inline_size(inode)); + inode_bh = ext4_journalled_write_inline_data(inode, len, page); + if (inode_bh == NULL) + goto out; + } else { + page_bufs = page_buffers(page); + if (!page_bufs) { + BUG(); + goto out; + } + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bget_one); + } /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page, BUG_ON(!ext4_handle_valid(handle)); - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + if (inline_data) { + ret = ext4_journal_get_write_access(handle, inode_bh); + + err = ext4_handle_dirty_metadata(handle, inode, inode_bh); - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + } else { + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + } if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page, if (!ret) ret = err; - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + if (!ext4_has_inline_data(inode)) + ext4_walk_page_buffers(handle, page_bufs, 0, len, + NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + brelse(inode_bh); return ret; } @@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page, commit_write = 1; } page_bufs = page_buffers(page); - if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { + if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation, so redirty * the page and return. We may reach here when we do @@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) * mpage_da_map_and_submit to map a single contiguous memory region * and then write them. */ -static int write_cache_pages_da(struct address_space *mapping, +static int write_cache_pages_da(handle_t *handle, + struct address_space *mapping, struct writeback_control *wbc, struct mpage_da_data *mpd, pgoff_t *done_index) @@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping, wait_on_page_writeback(page); BUG_ON(PageWriteback(page)); + /* + * If we have inline data and arrive here, it means that + * we will soon create the block for the 1st page, so + * we'd better clear the inline data here. + */ + if (ext4_has_inline_data(inode)) { + BUG_ON(ext4_test_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA)); + ext4_destroy_inline_data(handle, inode); + } + if (mpd->next_page != page->index) mpd->first_page = page->index; mpd->next_page = page->index + 1; @@ -2381,7 +2428,8 @@ retry: * contiguous region of logical blocks that need * blocks to be allocated by ext4 and submit them. */ - ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); + ret = write_cache_pages_da(handle, mapping, + wbc, &mpd, &done_index); /* * If we have a contiguous extent of pages and we * haven't done the I/O yet, map the blocks and submit @@ -2445,7 +2493,6 @@ out_writepages: return ret; } -#define FALL_BACK_TO_NONDELALLOC 1 static int ext4_nonda_switch(struct super_block *sb) { s64 free_blocks, dirty_blocks; @@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len, flags); + + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + ret = ext4_da_write_inline_data_begin(mapping, inode, + pos, len, flags, + pagep, fsdata); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + goto out; + } + } + retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file, * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. */ - new_i_size = pos + copied; if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_da_should_update_i_disksize(page, end)) { + if (ext4_has_inline_data(inode) || + ext4_da_should_update_i_disksize(page, end)) { down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) { - /* - * Updating i_disksize when extending file - * without needing block allocation - */ - if (ext4_should_order_data(inode)) - ret = ext4_jbd2_file_inode(handle, - inode); - + if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - } up_write(&EXT4_I(inode)->i_data_sem); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size @@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file, ext4_mark_inode_dirty(handle, inode); } } - ret2 = generic_write_end(file, mapping, pos, len, copied, + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + page); + else + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + copied = ret2; if (ret2 < 0) ret = ret2; @@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) journal_t *journal; int err; + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) + return 0; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && test_opt(inode->i_sb, DELALLOC)) { /* @@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) static int ext4_readpage(struct file *file, struct page *page) { + int ret = -EAGAIN; + struct inode *inode = page->mapping->host; + trace_ext4_readpage(page); - return mpage_readpage(page, ext4_get_block); + + if (ext4_has_inline_data(inode)) + ret = ext4_readpage_inline(inode, page); + + if (ret == -EAGAIN) + return mpage_readpage(page, ext4_get_block); + + return ret; } static int ext4_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { + struct inode *inode = mapping->host; + + /* If the file has inline data, no need to do readpages. */ + if (ext4_has_inline_data(inode)) + return 0; + return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); } @@ -2840,7 +2921,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) * We allocate an uinitialized extent if blocks haven't been allocated. * The extent will be converted to initialized after the IO is complete. */ -static int ext4_get_block_write(struct inode *inode, sector_t iblock, +int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", @@ -2850,29 +2931,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock, } static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int flags) + struct buffer_head *bh_result, int create) { - handle_t *handle = ext4_journal_current_handle(); - struct ext4_map_blocks map; - int ret = 0; - - ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n", - inode->i_ino, flags); - - flags = EXT4_GET_BLOCKS_NO_LOCK; - - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - - ret = ext4_map_blocks(handle, inode, &map, flags); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); - bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | - map.m_flags; - bh_result->b_size = inode->i_sb->s_blocksize * map.m_len; - ret = 0; - } - return ret; + ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + inode->i_ino, create); + return _ext4_get_block(inode, iblock, bh_result, + EXT4_GET_BLOCKS_NO_LOCK); } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -2978,10 +3042,10 @@ retry: * fall back to buffered IO. * * For holes, we fallocate those blocks, mark them as uninitialized - * If those blocks were preallocated, we mark sure they are splited, but + * If those blocks were preallocated, we mark sure they are split, but * still keep the range to write as uninitialized. * - * The unwrritten extents will be converted to written when DIO is completed. + * The unwritten extents will be converted to written when DIO is completed. * For async direct IO, since the IO may still pending when return, we * set up an end_io call back function, which will do the conversion * when async direct IO completed. @@ -2999,125 +3063,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, struct inode *inode = file->f_mapping->host; ssize_t ret; size_t count = iov_length(iov, nr_segs); - + int overwrite = 0; + get_block_t *get_block_func = NULL; + int dio_flags = 0; loff_t final_size = offset + count; - if (rw == WRITE && final_size <= inode->i_size) { - int overwrite = 0; - BUG_ON(iocb->private == NULL); + /* Use the old path for reads and writes beyond i_size. */ + if (rw != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); - /* If we do a overwrite dio, i_mutex locking can be released */ - overwrite = *((int *)iocb->private); + BUG_ON(iocb->private == NULL); - if (overwrite) { - atomic_inc(&inode->i_dio_count); - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + /* If we do a overwrite dio, i_mutex locking can be released */ + overwrite = *((int *)iocb->private); - /* - * We could direct write to holes and fallocate. - * - * Allocated blocks to fill the hole are marked as uninitialized - * to prevent parallel buffered read to expose the stale data - * before DIO complete the data IO. - * - * As to previously fallocated extents, ext4 get_block - * will just simply mark the buffer mapped but still - * keep the extents uninitialized. - * - * for non AIO case, we will convert those unwritten extents - * to written after return back from blockdev_direct_IO. - * - * for async DIO, the conversion needs to be defered when - * the IO is completed. The ext4 end_io callback function - * will be called to take care of the conversion work. - * Here for async case, we allocate an io_end structure to - * hook to the iocb. - */ - iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - ext4_io_end_t *io_end = - ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - io_end->flag |= EXT4_IO_END_DIRECT; - iocb->private = io_end; - /* - * we save the io structure for current async - * direct IO, so that later ext4_map_blocks() - * could flag the io structure whether there - * is a unwritten extents needs to be converted - * when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } + if (overwrite) { + atomic_inc(&inode->i_dio_count); + down_read(&EXT4_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); + } - if (overwrite) - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write_nolock, - ext4_end_io_dio, - NULL, - 0); - else - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block_write, - ext4_end_io_dio, - NULL, - DIO_LOCKING); - if (iocb->private) - ext4_inode_aio_set(inode, NULL); + /* + * We could direct write to holes and fallocate. + * + * Allocated blocks to fill the hole are marked as + * uninitialized to prevent parallel buffered read to expose + * the stale data before DIO complete the data IO. + * + * As to previously fallocated extents, ext4 get_block will + * just simply mark the buffer mapped but still keep the + * extents uninitialized. + * + * For non AIO case, we will convert those unwritten extents + * to written after return back from blockdev_direct_IO. + * + * For async DIO, the conversion needs to be deferred when the + * IO is completed. The ext4 end_io callback function will be + * called to take care of the conversion work. Here for async + * case, we allocate an io_end structure to hook to the iocb. + */ + iocb->private = NULL; + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + io_end->flag |= EXT4_IO_END_DIRECT; + iocb->private = io_end; /* - * The io_end structure takes a reference to the inode, - * that structure needs to be destroyed and the - * reference to the inode need to be dropped, when IO is - * complete, even with 0 byte write, or failed. - * - * In the successful AIO DIO case, the io_end structure will be - * desctroyed and the reference to the inode will be dropped - * after the end_io call back function is called. - * - * In the case there is 0 byte write, or error case, since - * VFS direct IO won't invoke the end_io call back function, - * we need to free the end_io structure here. + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. */ - if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { - ext4_free_io_end(iocb->private); - iocb->private = NULL; - } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, - EXT4_STATE_DIO_UNWRITTEN)) { - int err; - /* - * for non AIO case, since the IO is already - * completed, we could do the conversion right here - */ - err = ext4_convert_unwritten_extents(inode, - offset, ret); - if (err < 0) - ret = err; - ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); - } + ext4_inode_aio_set(inode, io_end); + } - retake_lock: - /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - inode_dio_done(inode); - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) { + get_block_func = ext4_get_block_write_nolock; + } else { + get_block_func = ext4_get_block_write; + dio_flags = DIO_LOCKING; + } + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + get_block_func, + ext4_end_io_dio, + NULL, + dio_flags); + + if (iocb->private) + ext4_inode_aio_set(inode, NULL); + /* + * The io_end structure takes a reference to the inode, that + * structure needs to be destroyed and the reference to the + * inode need to be dropped, when IO is complete, even with 0 + * byte write, or failed. + * + * In the successful AIO DIO case, the io_end structure will + * be destroyed and the reference to the inode will be dropped + * after the end_io call back function is called. + * + * In the case there is 0 byte write, or error case, since VFS + * direct IO won't invoke the end_io call back function, we + * need to free the end_io structure here. + */ + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; + } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, + EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the conversion right here + */ + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } - return ret; +retake_lock: + /* take i_mutex locking again if we do a ovewrite dio */ + if (overwrite) { + inode_dio_done(inode); + up_read(&EXT4_I(inode)->i_data_sem); + mutex_lock(&inode->i_mutex); } - /* for write the the end of file case, we fall back to old way */ - return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + return ret; } static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, @@ -3134,6 +3193,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_should_journal_data(inode)) return 0; + /* Let buffer I/O handle the inline data case. */ + if (ext4_has_inline_data(inode)) + return 0; + trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); @@ -3531,6 +3594,14 @@ void ext4_truncate(struct inode *inode) if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + if (ext4_has_inline_data(inode)) { + int has_inline = 1; + + ext4_inline_data_truncate(inode, &has_inline); + if (has_inline) + return; + } + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ext4_ext_truncate(inode); else @@ -3756,6 +3827,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } +static inline void ext4_iget_extra_inode(struct inode *inode, + struct ext4_inode *raw_inode, + struct ext4_inode_info *ei) +{ + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_find_inline_data_nolock(inode); + } else + EXT4_I(inode)->i_inline_off = 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -3826,6 +3910,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ + ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. @@ -3898,11 +3983,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - __le32 *magic = (void *)raw_inode + - EXT4_GOOD_OLD_INODE_SIZE + - ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) - ext4_set_inode_state(inode, EXT4_STATE_XATTR); + ext4_iget_extra_inode(inode, raw_inode, ei); } } @@ -3925,17 +4006,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); - } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - (S_ISLNK(inode->i_mode) && - !ext4_inode_is_fast_symlink(inode))) { - /* Validate block references which are part of inode */ - ret = ext4_ind_check_inode(inode); + } else if (!ext4_has_inline_data(inode)) { + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode)))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); + } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) { + /* Validate block references which are part of inode */ + ret = ext4_ind_check_inode(inode); + } } if (ret) goto bad_inode; @@ -4122,9 +4205,10 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } - } else + } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; + } raw_inode->i_disk_version = cpu_to_le32(inode->i_version); if (ei->i_extra_isize) { @@ -4811,8 +4895,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * journal_start/journal_stop which can block and take a long time */ if (page_has_buffers(page)) { - if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped)) { + if (!ext4_walk_page_buffers(NULL, page_buffers(page), + 0, len, NULL, + ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ wait_on_page_writeback(page); ret = VM_FAULT_LOCKED; @@ -4833,7 +4918,7 @@ retry_alloc: } ret = __block_page_mkwrite(vma, vmf, get_block); if (!ret && ext4_should_journal_data(inode)) { - if (walk_page_buffers(handle, page_buffers(page), 0, + if (ext4_walk_page_buffers(handle, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { unlock_page(page); ret = VM_FAULT_SIGBUS; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 526e5535860..1bf6fe785c4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_start += next; while (needed > ex->fe_len && - (buddy = mb_find_buddy(e4b, order, &max))) { + mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; @@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb, mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, entry->efd_count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, entry->efd_group, + entry->efd_start_cluster, + entry->efd_count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%d failed" + " with %d", entry->efd_group, + entry->efd_start_cluster, + entry->efd_count, err); + } err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) + if (*errp) { + ext4_discard_allocated_blocks(ac); goto errout; + } /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,10 +4343,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) - errout: + } else if (*errp) { ext4_discard_allocated_blocks(ac); - else { + goto errout; + } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4347,6 +4357,7 @@ repeat: *errp = -ENOSPC; } +errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; @@ -4656,8 +4667,16 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); + if (test_opt(sb, DISCARD)) { + err = ext4_issue_discard(sb, block_group, bit, count); + if (err && err != -EOPNOTSUPP) + ext4_msg(sb, KERN_WARNING, "discard request in" + " group:%d block:%d count:%lu failed" + " with %d", block_group, bit, count, + err); + } + + ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); @@ -4851,10 +4870,11 @@ error_return: * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static void ext4_trim_extent(struct super_block *sb, int start, int count, +static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) { struct ext4_free_extent ex; + int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); - ext4_issue_discard(sb, group, start, count); + ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); + return ret; } /** @@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; - int ret; + int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); @@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, + next - start, group, &e4b); + if (ret && ret != -EOPNOTSUPP) + break; + ret = 0; count += next - start; } free_count += next - start; @@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, break; } - if (!ret) + if (!ret) { + ret = count; EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -4959,7 +4985,7 @@ out: ext4_debug("trimmed %d blocks in the group %d\n", count, group); - return count; + return ret; } /** diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f1bb32ec016..db8226d595f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 292daeeed45..d9cc5ee42f5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include "ext4_jbd2.h" #include "ext4.h" +#include "ext4_extents.h" /** * get_ext_path - Find an extent path for designated logical block number. diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6d600a69fc9..cac44828233 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); /* checksumming functions */ -#define EXT4_DIRENT_TAIL(block, blocksize) \ - ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ - ((blocksize) - \ - sizeof(struct ext4_dir_entry_tail)))) - -static void initialize_dirent_tail(struct ext4_dir_entry_tail *t, - unsigned int blocksize) +void initialize_dirent_tail(struct ext4_dir_entry_tail *t, + unsigned int blocksize) { memset(t, 0, sizeof(struct ext4_dir_entry_tail)); t->det_rec_len = ext4_rec_len_to_disk( @@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } +static void warn_no_space_for_csum(struct inode *inode) +{ + ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " + "checksum. Please run e2fsck -D.", inode->i_ino); +} + int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; @@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return 0; } @@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode, t = get_dirent_tail(inode, dirent); if (!t) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir " - "leaf for checksum. Please run e2fsck -D."); + warn_no_space_for_csum(inode); return; } @@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode, (void *)t - (void *)dirent); } -static inline int ext4_handle_dirty_dirent_node(handle_t *handle, - struct inode *inode, - struct buffer_head *bh) +int ext4_handle_dirty_dirent_node(handle_t *handle, + struct inode *inode, + struct buffer_head *bh) { ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data); return ext4_handle_dirty_metadata(handle, inode, bh); @@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode, count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum found. Run e2fsck -D."); + warn_no_space_for_csum(inode); return 1; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { - EXT4_ERROR_INODE(inode, "metadata_csum set but no space for " - "tree checksum. Run e2fsck -D."); + warn_no_space_for_csum(inode); return; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) + ((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. */ @@ -1007,6 +1005,15 @@ errout: return (err); } +static inline int search_dirblock(struct buffer_head *bh, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) +{ + return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + d_name, offset, res_dir); +} /* * Directory block splitting, compacting @@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) dx_set_count(entries, count + 1); } -static void ext4_update_dx_flag(struct inode *inode) -{ - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) - ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); -} - /* * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. * @@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head *bh, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 ** res_dir) +int search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; @@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh, const char *name = d_name->name; int namelen = d_name->len; - de = (struct ext4_dir_entry_2 *) bh->b_data; - dlimit = bh->b_data + dir->i_sb->s_blocksize; + de = (struct ext4_dir_entry_2 *)search_buf; + dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ @@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) return -1; *res_dir = de; return 1; @@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh, return 0; } +static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + struct ext4_dir_entry *de) +{ + struct super_block *sb = dir->i_sb; + + if (!is_dx(dir)) + return 0; + if (block == 0) + return 1; + if (de->inode == 0 && + ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) == + sb->s_blocksize) + return 1; + return 0; +} /* * ext4_find_entry() @@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh, */ static struct buffer_head * ext4_find_entry (struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 ** res_dir) + struct ext4_dir_entry_2 **res_dir, + int *inlined) { struct super_block *sb; struct buffer_head *bh_use[NAMEI_RA_SIZE]; @@ -1179,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + ret = ext4_find_inline_entry(dir, d_name, res_dir, + &has_inline_data); + if (has_inline_data) { + if (inlined) + *inlined = 1; + return ret; + } + } + if ((namelen <= 2) && (name[0] == '.') && (name[1] == '.' || name[1] == '\0')) { /* @@ -1244,6 +1275,8 @@ restart: goto next; } if (!buffer_verified(bh) && + !is_dx_internal_node(dir, block, + (struct ext4_dir_entry *)bh->b_data) && !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) { EXT4_ERROR_INODE(dir, "checksumming directory " @@ -1361,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1395,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1593,6 +1626,63 @@ errout: return NULL; } +int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + const char *name, int namelen, + struct ext4_dir_entry_2 **dest_de) +{ + struct ext4_dir_entry_2 *de; + unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + int nlen, rlen; + unsigned int offset = 0; + char *top; + + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + buf, buf_size, offset)) + return -EIO; + if (ext4_match(namelen, name, de)) + return -EEXIST; + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } + if ((char *) de > top) + return -ENOSPC; + + *dest_de = de; + return 0; +} + +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + const char *name, int namelen) +{ + + int nlen, rlen; + + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = + (struct ext4_dir_entry_2 *)((char *)de + nlen); + de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); + de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); + de = de1; + } + de->file_type = EXT4_FT_UNKNOWN; + de->inode = cpu_to_le32(inode->i_ino); + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = namelen; + memcpy(de->name, name, namelen); +} /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1608,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned int offset = 0; unsigned int blocksize = dir->i_sb->s_blocksize; unsigned short reclen; - int nlen, rlen, err; - char *top; int csum_size = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) @@ -1621,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, reclen = EXT4_DIR_REC_LEN(namelen); if (!de) { - de = (struct ext4_dir_entry_2 *)bh->b_data; - top = bh->b_data + (blocksize - csum_size) - reclen; - while ((char *) de <= top) { - if (ext4_check_dir_entry(dir, NULL, de, bh, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if ((de->inode? rlen - nlen: rlen) >= reclen) - break; - de = (struct ext4_dir_entry_2 *)((char *)de + rlen); - offset += rlen; - } - if ((char *) de > top) - return -ENOSPC; + err = ext4_find_dest_de(dir, inode, + bh, bh->b_data, blocksize - csum_size, + name, namelen, &de); + if (err) + return err; } BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, bh); @@ -1646,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } /* By now the buffer is marked for journaling */ - nlen = EXT4_DIR_REC_LEN(de->name_len); - rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); - if (de->inode) { - struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen); - de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize); - de->rec_len = ext4_rec_len_to_disk(nlen, blocksize); - de = de1; - } - de->file_type = EXT4_FT_UNKNOWN; - de->inode = cpu_to_le32(inode->i_ino); - ext4_set_de_type(dir->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1831,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; + + if (ext4_has_inline_data(dir)) { + retval = ext4_try_add_inline_entry(handle, dentry, inode); + if (retval < 0) + return retval; + if (retval == 1) { + retval = 0; + return retval; + } + } + if (is_dx(dir)) { retval = ext4_dx_add_entry(handle, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) @@ -2036,36 +2113,29 @@ cleanup: } /* - * ext4_delete_entry deletes a directory entry by merging it with the - * previous entry + * ext4_generic_delete_entry deletes a directory entry by merging it + * with the previous entry */ -static int ext4_delete_entry(handle_t *handle, - struct inode *dir, - struct ext4_dir_entry_2 *de_del, - struct buffer_head *bh) +int ext4_generic_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + void *entry_buf, + int buf_size, + int csum_size) { struct ext4_dir_entry_2 *de, *pde; unsigned int blocksize = dir->i_sb->s_blocksize; - int csum_size = 0; - int i, err; - - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) - csum_size = sizeof(struct ext4_dir_entry_tail); + int i; i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *) bh->b_data; - while (i < bh->b_size - csum_size) { - if (ext4_check_dir_entry(dir, NULL, de, bh, i)) + de = (struct ext4_dir_entry_2 *)entry_buf; + while (i < buf_size - csum_size) { + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, bh->b_size, i)) return -EIO; if (de == de_del) { - BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } if (pde) pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, @@ -2076,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, dir, bh); - if (unlikely(err)) { - ext4_std_error(dir->i_sb, err); - return err; - } return 0; } i += ext4_rec_len_from_disk(de->rec_len, blocksize); @@ -2091,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle, return -ENOENT; } +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) +{ + int err, csum_size = 0; + + if (ext4_has_inline_data(dir)) { + int has_inline_data = 1; + err = ext4_delete_inline_entry(handle, dir, de_del, bh, + &has_inline_data); + if (has_inline_data) + return err; + } + + if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); + if (unlikely(err)) + goto out; + + err = ext4_generic_delete_entry(handle, dir, de_del, + bh, bh->b_data, + dir->i_sb->s_blocksize, csum_size); + if (err) + goto out; + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (unlikely(err)) + goto out; + + return 0; +out: + if (err != -ENOENT) + ext4_std_error(dir->i_sb, err); + return err; +} + /* * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2, * since this indicates that nlinks count was previously 1. @@ -2211,21 +2317,95 @@ retry: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) +{ + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - + (csum_size + EXT4_DIR_REC_LEN(1)), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( + EXT4_DIR_REC_LEN(de->name_len), blocksize); + strcpy(de->name, ".."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); +} + +static int ext4_init_new_dir(handle_t *handle, struct inode *dir, + struct inode *inode) { - handle_t *handle; - struct inode *inode; struct buffer_head *dir_block = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err, retries = 0; + int err; if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) csum_size = sizeof(struct ext4_dir_entry_tail); + if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + err = ext4_try_create_inline_dir(handle, dir, inode); + if (err < 0 && err != -ENOSPC) + goto out; + if (!err) + goto out; + } + + inode->i_size = EXT4_I(inode)->i_disksize = blocksize; + dir_block = ext4_bread(handle, inode, 0, 1, &err); + if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { + if (!err) { + err = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + goto out; + } + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out; + de = (struct ext4_dir_entry_2 *)dir_block->b_data; + ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); + initialize_dirent_tail(t, blocksize); + } + + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + if (err) + goto out; + set_buffer_verified(dir_block); +out: + brelse(dir_block); + return err; +} + +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + handle_t *handle; + struct inode *inode; + int err, retries = 0; + if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2249,47 +2429,9 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; - inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { - if (!err) { - err = -EIO; - ext4_error(inode->i_sb, - "Directory hole detected on inode %lu\n", - inode->i_ino); - } - goto out_clear_inode; - } - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out_clear_inode; - de = (struct ext4_dir_entry_2 *) dir_block->b_data; - de->inode = cpu_to_le32(inode->i_ino); - de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), - blocksize); - strcpy(de->name, "."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - de = ext4_next_entry(de, blocksize); - de->inode = cpu_to_le32(dir->i_ino); - de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), - blocksize); - de->name_len = 2; - strcpy(de->name, ".."); - ext4_set_de_type(dir->i_sb, de, S_IFDIR); - set_nlink(inode, 2); - - if (csum_size) { - t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); - initialize_dirent_tail(t, blocksize); - } - - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_dirent_node(handle, inode, dir_block); + err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; - set_buffer_verified(dir_block); err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2309,7 +2451,6 @@ out_clear_inode: unlock_new_inode(inode); d_instantiate(dentry, inode); out_stop: - brelse(dir_block); ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; @@ -2327,6 +2468,14 @@ static int empty_dir(struct inode *inode) struct super_block *sb; int err = 0; + if (ext4_has_inline_data(inode)) { + int has_inline_data = 1; + + err = empty_inline_dir(inode, &has_inline_data); + if (has_inline_data) + return err; + } + sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { @@ -2393,7 +2542,8 @@ static int empty_dir(struct inode *inode) set_buffer_verified(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; } - if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) { + if (ext4_check_dir_entry(inode, NULL, de, bh, + bh->b_data, bh->b_size, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; @@ -2579,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_rmdir; @@ -2644,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_handle_sync(handle); retval = -ENOENT; - bh = ext4_find_entry(dir, &dentry->d_name, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (!bh) goto end_unlink; @@ -2826,8 +2976,39 @@ retry: return err; } -#define PARENT_INO(buffer, size) \ - (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode) + +/* + * Try to find buffer head where contains the parent block. + * It should be the inode block if it is inlined or the 1st block + * if it is a normal dir. + */ +static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, + struct inode *inode, + int *retval, + struct ext4_dir_entry_2 **parent_de, + int *inlined) +{ + struct buffer_head *bh; + + if (!ext4_has_inline_data(inode)) { + if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) { + if (!*retval) { + *retval = -EIO; + ext4_error(inode->i_sb, + "Directory hole detected on inode %lu\n", + inode->i_ino); + } + return NULL; + } + *parent_de = ext4_next_entry( + (struct ext4_dir_entry_2 *)bh->b_data, + inode->i_sb->s_blocksize); + return bh; + } + + *inlined = 1; + return ext4_get_first_inline_block(inode, parent_de, retval); +} /* * Anybody can rename anything with this: the permission checks are left to the @@ -2841,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh, *new_bh, *dir_bh; struct ext4_dir_entry_2 *old_de, *new_de; int retval, force_da_alloc = 0; + int inlined = 0, new_inlined = 0; + struct ext4_dir_entry_2 *parent_de; dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -2860,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) ext4_handle_sync(handle); - old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -2873,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, + &new_de, &new_inlined); if (new_bh) { if (!new_inode) { brelse(new_bh); @@ -2887,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } retval = -EIO; - if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) { - if (!retval) { - retval = -EIO; - ext4_error(old_inode->i_sb, - "Directory hole detected on inode %lu\n", - old_inode->i_ino); - } + dir_bh = ext4_get_first_dir_block(handle, old_inode, + &retval, &parent_de, + &inlined); + if (!dir_bh) goto end_rename; - } - if (!buffer_verified(dir_bh) && + if (!inlined && !buffer_verified(dir_bh) && !ext4_dirent_csum_verify(old_inode, (struct ext4_dir_entry *)dir_bh->b_data)) goto end_rename; set_buffer_verified(dir_bh); - if (le32_to_cpu(PARENT_INO(dir_bh->b_data, - old_dir->i_sb->s_blocksize)) != old_dir->i_ino) + if (le32_to_cpu(parent_de->inode) != old_dir->i_ino) goto end_rename; retval = -EMLINK; if (!new_inode && new_dir != old_dir && @@ -2931,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); - retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh); - if (unlikely(retval)) { - ext4_std_error(new_dir->i_sb, retval); - goto end_rename; + if (!new_inlined) { + retval = ext4_handle_dirty_dirent_node(handle, + new_dir, new_bh); + if (unlikely(retval)) { + ext4_std_error(new_dir->i_sb, retval); + goto end_rename; + } } brelse(new_bh); new_bh = NULL; @@ -2962,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, + &old_de2, NULL); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -2982,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); ext4_update_dx_flag(old_dir); if (dir_bh) { - PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = - cpu_to_le32(new_dir->i_ino); + parent_de->inode = cpu_to_le32(new_dir->i_ino); BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); - if (is_dx(old_inode)) { - retval = ext4_handle_dirty_dx_node(handle, - old_inode, - dir_bh); + if (!inlined) { + if (is_dx(old_inode)) { + retval = ext4_handle_dirty_dx_node(handle, + old_inode, + dir_bh); + } else { + retval = ext4_handle_dirty_dirent_node(handle, + old_inode, dir_bh); + } } else { - retval = ext4_handle_dirty_dirent_node(handle, - old_inode, - dir_bh); + retval = ext4_mark_inode_dirty(handle, old_inode); } if (retval) { ext4_std_error(old_dir->i_sb, retval); @@ -3043,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif .get_acl = ext4_get_acl, }; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 68e896e12a6..0016fbca2a4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "ext4_extents.h" static struct kmem_cache *io_page_cachep, *io_end_cachep; @@ -111,7 +110,7 @@ static int ext4_end_io(ext4_io_end_t *io) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(io->inode)); + wake_up_all(ext4_ioend_wq(inode)); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 47bf06a2765..d99387b89ed 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_sbh; + goto exit_dind; err = ext4_journal_get_write_access(handle, dind); if (unlikely(err)) @@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dindj; + goto exit_dind; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, exit_inode: ext4_kvfree(n_group_desc); - /* ext4_handle_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); -exit_dindj: - /* ext4_handle_release_buffer(handle, dind); */ -exit_sbh: - /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ exit_dind: brelse(dind); exit_bh: @@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, } for (i = 0; i < reserved_gdb; i++) { - if ((err = ext4_journal_get_write_access(handle, primary[i]))) { - /* - int j; - for (j = 0; j < i; j++) - ext4_handle_release_buffer(handle, primary[j]); - */ + if ((err = ext4_journal_get_write_access(handle, primary[i]))) goto exit_bh; - } } if ((err = ext4_reserve_inode_write(handle, inode, &iloc))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80928f71685..3cdb0a2fc64 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -45,7 +45,7 @@ #include <linux/freezer.h> #include "ext4.h" -#include "ext4_extents.h" +#include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -939,10 +939,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; - ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); + rwlock_init(&ei->i_es_lock); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -996,9 +997,7 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); -#endif init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); } @@ -1031,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode) clear_inode(inode); dquot_drop(inode); ext4_discard_preallocations(inode); + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1447,13 +1447,8 @@ static const struct mount_opts { {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ}, {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ}, {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ}, -#ifdef CONFIG_EXT4_FS_XATTR {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR}, -#else - {Opt_user_xattr, 0, MOPT_NOSUPPORT}, - {Opt_nouser_xattr, 0, MOPT_NOSUPPORT}, -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR}, @@ -3202,7 +3197,6 @@ int ext4_calculate_overhead(struct super_block *sb) ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_KERNEL); - memset(buf, 0, PAGE_SIZE); if (!buf) return -ENOMEM; @@ -3256,7 +3250,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int i; int needs_recovery, has_huge_files, has_bigalloc; __u64 blocks_count; - int err; + int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; @@ -3272,9 +3266,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb = sb; - sbi->s_mount_opt = 0; - sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID); - sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID); sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; if (sb->s_bdev->bd_part) @@ -3285,6 +3276,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + /* -EINVAL is default */ ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { @@ -3369,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ -#ifdef CONFIG_EXT4_FS_XATTR set_opt(sb, XATTR_USER); -#endif #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3662,7 +3652,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); - ret = err; goto failed_mount; } @@ -3770,7 +3759,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); - ret = err; goto failed_mount3; } @@ -3801,7 +3789,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); - sbi->s_resize_flags = 0; sb->s_root = NULL; @@ -3897,8 +3884,8 @@ no_journal: if (es->s_overhead_clusters) sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); else { - ret = ext4_calculate_overhead(sb); - if (ret) + err = ext4_calculate_overhead(sb); + if (err) goto failed_mount_wq; } @@ -3910,6 +3897,7 @@ no_journal: alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); + ret = -ENOMEM; goto failed_mount_wq; } @@ -4012,12 +4000,20 @@ no_journal: /* Enable quota usage during mount. */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && !(sb->s_flags & MS_RDONLY)) { - ret = ext4_enable_quotas(sb); - if (ret) + err = ext4_enable_quotas(sb); + if (err) goto failed_mount7; } #endif /* CONFIG_QUOTA */ + if (test_opt(sb, DISCARD)) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + if (!blk_queue_discard(q)) + ext4_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); @@ -4084,7 +4080,7 @@ out_fail: kfree(sbi); out_free_orig: kfree(orig_data); - return ret; + return err ? err : ret; } /* @@ -4790,7 +4786,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead); + buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ @@ -5282,6 +5278,7 @@ static int __init ext4_init_fs(void) ext4_li_info = NULL; mutex_init(&ext4_li_mtx); + /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { @@ -5289,9 +5286,14 @@ static int __init ext4_init_fs(void) init_waitqueue_head(&ext4__ioend_wq[i]); } - err = ext4_init_pageio(); + err = ext4_init_es(); if (err) return err; + + err = ext4_init_pageio(); + if (err) + goto out7; + err = ext4_init_system_zone(); if (err) goto out6; @@ -5341,6 +5343,9 @@ out5: ext4_exit_system_zone(); out6: ext4_exit_pageio(); +out7: + ext4_exit_es(); + return err; } diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ed9354aff27..ff371193201 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = { .follow_link = page_follow_link_light, .put_link = page_put_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, .removexattr = generic_removexattr, -#endif }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 2cdb98d6298..3a91ebc2b66 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -61,11 +61,6 @@ #include "xattr.h" #include "acl.h" -#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) -#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) -#define BFIRST(bh) ENTRY(BHDR(bh)+1) -#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) - #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, f...) do { \ printk(KERN_DEBUG "inode %s:%lu: ", \ @@ -312,7 +307,7 @@ cleanup: return error; } -static int +int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { @@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, return (*min_offs - ((void *)last - base) - sizeof(__u32)); } -struct ext4_xattr_info { - int name_index; - const char *name; - const void *value; - size_t value_len; -}; - -struct ext4_xattr_search { - struct ext4_xattr_entry *first; - void *base; - void *end; - struct ext4_xattr_entry *here; - int not_found; -}; - static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) { @@ -648,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size. Just replace. */ s->here->e_value_size = cpu_to_le32(i->value_len); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } return 0; } @@ -689,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) size_t size = EXT4_XATTR_SIZE(i->value_len); void *val = s->base + min_offs - size; s->here->e_value_offs = cpu_to_le16(min_offs - size); - memset(val + size - EXT4_XATTR_PAD, 0, - EXT4_XATTR_PAD); /* Clear the pad bytes. */ - memcpy(val, i->value, i->value_len); + if (i->value == EXT4_ZERO_XATTR_VALUE) { + memset(val, 0, size); + } else { + /* Clear the pad bytes first. */ + memset(val + size - EXT4_XATTR_PAD, 0, + EXT4_XATTR_PAD); + memcpy(val, i->value, i->value_len); + } } } return 0; @@ -794,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, int offset = (char *)s->here - bs->bh->b_data; unlock_buffer(bs->bh); - ext4_handle_release_buffer(handle, bs->bh); if (ce) { mb_cache_entry_release(ce); ce = NULL; @@ -950,14 +939,8 @@ bad_block: #undef header } -struct ext4_xattr_ibody_find { - struct ext4_xattr_search s; - struct ext4_iloc iloc; -}; - -static int -ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; @@ -985,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -static int -ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) +int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) +{ + struct ext4_xattr_ibody_header *header; + struct ext4_xattr_search *s = &is->s; + int error; + + if (EXT4_I(inode)->i_extra_isize == 0) + return -ENOSPC; + error = ext4_xattr_set_entry(i, s); + if (error) { + if (error == -ENOSPC && + ext4_has_inline_data(inode)) { + error = ext4_try_to_evict_inline_data(handle, inode, + EXT4_XATTR_LEN(strlen(i->name) + + EXT4_XATTR_SIZE(i->value_len))); + if (error) + return error; + error = ext4_xattr_ibody_find(inode, i, is); + if (error) + return error; + error = ext4_xattr_set_entry(i, s); + } + if (error) + return error; + } + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); + ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; +} + +static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; @@ -1144,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name, { handle_t *handle; int error, retries = 0; + int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); retry: - handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); + /* + * In case of inline data, we may push out the data to a block, + * So reserve the journal space first. + */ + if (ext4_has_inline_data(inode)) + credits += ext4_writepage_trans_blocks(inode) + 1; + + handle = ext4_journal_start(inode, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 91f31ca7d9a..69eda787a96 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -21,6 +21,7 @@ #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 +#define EXT4_XATTR_INDEX_SYSTEM 7 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -65,7 +66,32 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -# ifdef CONFIG_EXT4_FS_XATTR +#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) +#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) +#define BFIRST(bh) ENTRY(BHDR(bh)+1) +#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) + +#define EXT4_ZERO_XATTR_VALUE ((void *)-1) + +struct ext4_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ext4_xattr_search { + struct ext4_xattr_entry *first; + void *base; + void *end; + struct ext4_xattr_entry *here; + int not_found; +}; + +struct ext4_xattr_ibody_find { + struct ext4_xattr_search s; + struct ext4_iloc iloc; +}; extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; @@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void); extern const struct xattr_handler *ext4_xattr_handlers[]; -# else /* CONFIG_EXT4_FS_XATTR */ - -static inline int -ext4_xattr_get(struct inode *inode, int name_index, const char *name, - void *buffer, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set(struct inode *inode, int name_index, const char *name, - const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline int -ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, - const char *name, const void *value, size_t size, int flags) -{ - return -EOPNOTSUPP; -} - -static inline void -ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) -{ -} - -static inline void -ext4_xattr_put_super(struct super_block *sb) -{ -} - -static __init inline int -ext4_init_xattr(void) -{ - return 0; -} - -static inline void -ext4_exit_xattr(void) -{ -} - -static inline int -ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, - struct ext4_inode *raw_inode, handle_t *handle) -{ - return -EOPNOTSUPP; -} - -#define ext4_xattr_handlers NULL - -# endif /* CONFIG_EXT4_FS_XATTR */ +extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, + const char *name, + void *buffer, size_t buffer_size); +extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); + +extern int ext4_has_inline_data(struct inode *inode); +extern int ext4_get_inline_size(struct inode *inode); +extern int ext4_get_max_inline_size(struct inode *inode); +extern int ext4_find_inline_data_nolock(struct inode *inode); +extern void ext4_write_inline_data(struct inode *inode, + struct ext4_iloc *iloc, + void *buffer, loff_t pos, + unsigned int len); +extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, + unsigned int len); +extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); + +extern int ext4_readpage_inline(struct inode *inode, struct page *page); +extern int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep); +extern int ext4_write_inline_data_end(struct inode *inode, + loff_t pos, unsigned len, + unsigned copied, + struct page *page); +extern struct buffer_head * +ext4_journalled_write_inline_data(struct inode *inode, + unsigned len, + struct page *page); +extern int ext4_da_write_inline_data_begin(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata); +extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, + unsigned len, unsigned copied, + struct page *page); +extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ext4_try_create_inline_dir(handle_t *handle, + struct inode *parent, + struct inode *inode); +extern int ext4_read_inline_dir(struct file *filp, + void *dirent, filldir_t filldir, + int *has_inline_data); +extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *has_inline_data); +extern int ext4_delete_inline_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh, + int *has_inline_data); +extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, + struct ext4_dir_entry_2 **parent_de, + int *retval); +extern int ext4_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + int *has_inline); +extern int ext4_try_to_evict_inline_data(handle_t *handle, + struct inode *inode, + int needed); +extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline); + +extern int ext4_convert_inline_data(struct inode *inode); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 484b8d1c6cb..dbf41f9452d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); -EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); #if 0 EXPORT_SYMBOL(journal_sync_buffer); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d8da40e99d8..42f6615af0a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1207,17 +1207,6 @@ out: return ret; } -/* - * jbd2_journal_release_buffer: undo a get_write_access without any buffer - * updates, if the update decided in the end that it didn't need access. - * - */ -void -jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh) -{ - BUFFER_TRACE(bh, "entry"); -} - /** * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 9cc4a3fbf4b..bc3968fa81e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -193,19 +193,15 @@ static int nfs_idmap_init_keyring(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".id_resolver", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 284e80831d2..701beab27aa 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif +#ifndef pte_accessible +# define pte_accessible(pte) ((void)(pte),1) +#endif + #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the + * same bit too). It's set only when _PAGE_PRESET is not set and it's + * never set if _PAGE_PRESENT is set. + * + * pte/pmd_present() returns true if pte/pmd_numa returns true. Page + * fault triggers on those regions if pte/pmd_numa returns true + * (because _PAGE_PRESENT is not set). + */ +#ifndef pte_numa +static inline int pte_numa(pte_t pte) +{ + return (pte_flags(pte) & + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +#ifndef pmd_numa +static inline int pmd_numa(pmd_t pmd) +{ + return (pmd_flags(pmd) & + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +/* + * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically + * because they're called by the NUMA hinting minor page fault. If we + * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler + * would be forced to set it later while filling the TLB after we + * return to userland. That would trigger a second write to memory + * that we optimize away by setting _PAGE_ACCESSED here. + */ +#ifndef pte_mknonnuma +static inline pte_t pte_mknonnuma(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_NUMA); + return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pmd_mknonnuma +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_NUMA); + return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pte_mknuma +static inline pte_t pte_mknuma(pte_t pte) +{ + pte = pte_set_flags(pte, _PAGE_NUMA); + return pte_clear_flags(pte, _PAGE_PRESENT); +} +#endif + +#ifndef pmd_mknuma +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + pmd = pmd_set_flags(pmd, _PAGE_NUMA); + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} +#endif +#else +extern int pte_numa(pte_t pte); +extern int pmd_numa(pmd_t pmd); +extern pte_t pte_mknonnuma(pte_t pte); +extern pmd_t pmd_mknonnuma(pmd_t pmd); +extern pte_t pte_mknuma(pte_t pte); +extern pmd_t pmd_mknuma(pmd_t pmd); +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#else +static inline int pmd_numa(pmd_t pmd) +{ + return 0; +} + +static inline int pte_numa(pte_t pte) +{ + return 0; +} + +static inline pte_t pte_mknonnuma(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + return pmd; +} + +static inline pte_t pte_mknuma(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_NUMA_BALANCING */ + #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/cred.h b/include/linux/cred.h index ebbed2ce663..0142aacb70b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -77,21 +77,6 @@ extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); /* - * The common credentials for a thread group - * - shared by CLONE_THREAD - */ -#ifdef CONFIG_KEYS -struct thread_group_cred { - atomic_t usage; - pid_t tgid; /* thread group process ID */ - spinlock_t lock; - struct key __rcu *session_keyring; /* keyring inherited over fork */ - struct key *process_keyring; /* keyring private to this process */ - struct rcu_head rcu; /* RCU deletion hook */ -}; -#endif - -/* * The security context of a task * * The parts of the context break down into two categories: @@ -139,6 +124,8 @@ struct cred { #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ + struct key __rcu *session_keyring; /* keyring inherited over fork */ + struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ struct thread_group_cred *tgcred; /* thread-group shared credentials */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 092dc5305a3..1d76f8ca90f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, @@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock(__anon_vma); \ + anon_vma_lock_write(__anon_vma); \ anon_vma_unlock(__anon_vma); \ BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ @@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } + +extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } + +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + return 0; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3e7fa1acf09..0c80d3f57a5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); #else /* !CONFIG_HUGETLB_PAGE */ @@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src) { } -#define hugetlb_change_protection(vma, address, end, newprot) +static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + return 0; +} static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 3efc43f3f16..1be23d9fdac 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1096,7 +1096,6 @@ extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); void jbd2_journal_set_triggers(struct buffer_head *, struct jbd2_buffer_trigger_type *type); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); -extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); extern void journal_sync_buffer (struct buffer_head *); extern void jbd2_journal_invalidatepage(journal_t *, @@ -1303,15 +1302,21 @@ static inline int jbd_space_needed(journal_t *journal) extern int jbd_blocks_per_page(struct inode *inode); +/* JBD uses a CRC32 checksum */ +#define JBD_MAX_CHECKSUM_SIZE 4 + static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { struct { struct shash_desc shash; - char ctx[crypto_shash_descsize(journal->j_chksum_driver)]; + char ctx[JBD_MAX_CHECKSUM_SIZE]; } desc; int err; + BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > + JBD_MAX_CHECKSUM_SIZE); + desc.shash.tfm = journal->j_chksum_driver; desc.shash.flags = 0; *(u32 *)desc.ctx = crc; diff --git a/include/linux/key.h b/include/linux/key.h index 2393b1c040b..4dfde1161c5 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -265,6 +265,7 @@ extern int key_unlink(struct key *keyring, extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, + key_perm_t perm, unsigned long flags, struct key *dest); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index dbd212723b7..9adc270de7e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } +extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); + #else struct mempolicy {}; @@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } +static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + return -1; /* no node preference */ +} + #endif /* CONFIG_NUMA */ #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0b5865c61ef..1e9f627967a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page * sucessful migration case. */ +enum migrate_reason { + MR_COMPACTION, + MR_MEMORY_FAILURE, + MR_MEMORY_HOTPLUG, + MR_SYSCALL, /* also applies to cpusets */ + MR_MEMPOLICY_MBIND, + MR_NUMA_MISPLACED, + MR_CMA +}; #ifdef CONFIG_MIGRATION @@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode); + enum migrate_mode mode, int reason); extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode) { return -ENOSYS; } + enum migrate_mode mode, int reason) { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } @@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_NUMA_BALANCING +extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, int node); +extern bool migrate_ratelimited(int node); +#else +static inline int migrate_misplaced_page(struct page *page, int node) +{ + return -EAGAIN; /* can't migrate now */ +} +static inline bool migrate_ratelimited(int node) +{ + return false; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node); +#else +static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + return -EAGAIN; +} +#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ + #endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 4af4f0b1be4..7f4f906190b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page) } #endif +#ifdef CONFIG_NUMA_BALANCING +static inline int page_xchg_last_nid(struct page *page, int nid) +{ + return xchg(&page->_last_nid, nid); +} + +static inline int page_last_nid(struct page *page) +{ + return page->_last_nid; +} +static inline void reset_page_last_nid(struct page *page) +{ + page->_last_nid = -1; +} +#else +static inline int page_xchg_last_nid(struct page *page, int nid) +{ + return page_to_nid(page); +} + +static inline int page_last_nid(struct page *page) +{ + return page_to_nid(page); +} + +static inline void reset_page_last_nid(struct page *page) +{ +} +#endif + static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; @@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); +extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +#endif + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); @@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7ade2731b5d..7d9ebb7cc98 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,6 +175,10 @@ struct page { */ void *shadow; #endif + +#ifdef CONFIG_NUMA_BALANCING + int _last_nid; +#endif } /* * The struct page can be forced to be double word aligned so that atomic ops @@ -411,9 +415,36 @@ struct mm_struct { #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; #endif +#ifdef CONFIG_NUMA_BALANCING + /* + * numa_next_scan is the next time when the PTEs will me marked + * pte_numa to gather statistics and migrate pages to new nodes + * if necessary + */ + unsigned long numa_next_scan; + + /* numa_next_reset is when the PTE scanner period will be reset */ + unsigned long numa_next_reset; + + /* Restart point for scanning and setting pte_numa */ + unsigned long numa_scan_offset; + + /* numa_scan_seq prevents two threads setting pte_numa */ + int numa_scan_seq; + + /* + * The first node a task was scheduled on. If a task runs on + * a different node than Make PTE Scan Go Now. + */ + int first_nid; +#endif struct uprobes_state uprobes_state; }; +/* first nid will either be a valid NID or one of these values */ +#define NUMA_PTE_SCAN_INIT -1 +#define NUMA_PTE_SCAN_ACTIVE -2 + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd55dad56aa..4bec5be82ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -735,6 +735,19 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; +#ifdef CONFIG_NUMA_BALANCING + /* + * Lock serializing the per destination node AutoNUMA memory + * migration rate limiting data. + */ + spinlock_t numabalancing_migrate_lock; + + /* Rate limiting time interval */ + unsigned long numabalancing_migrate_next_window; + + /* Number of pages migrated during the rate limiting time interval */ + unsigned long numabalancing_migrate_nr_pages; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bfe1f478064..c20635c527a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -7,7 +7,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/memcontrol.h> /* @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { - struct anon_vma *root; /* Root of this anon_vma tree */ - struct mutex mutex; /* Serialize access to vma list */ + struct anon_vma *root; /* Root of this anon_vma tree */ + struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct rb_node rb; /* locked by anon_vma->mutex */ + struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; @@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } -static inline void anon_vma_lock(struct anon_vma *anon_vma) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + + /* * anon_vma helper functions. */ @@ -220,8 +231,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. */ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma_read(struct page *page); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c2f3072bee..b089c92c609 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,14 @@ struct task_struct { short il_next; short pref_node_fork; #endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + int numa_migrate_seq; + unsigned int numa_scan_period; + u64 node_stamp; /* migration stamp */ + struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ + struct rcu_head rcu; /* @@ -1601,6 +1609,18 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); +#else +static inline void task_numa_fault(int node, int pages, bool migrated) +{ +} +static inline void set_numabalancing_state(bool enabled) +{ +} +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -2030,6 +2050,13 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_settle_count; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 8d08b3ed406..071d62c214a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -34,21 +34,25 @@ enum dma_sync_target { SYNC_FOR_CPU = 0, SYNC_FOR_DEVICE = 1, }; -extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, - phys_addr_t phys, size_t size, - enum dma_data_direction dir); -extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, +/* define the last possible byte of physical address space as a mapping error */ +#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0) + +extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, + dma_addr_t tbl_dma_addr, + phys_addr_t phys, size_t size, + enum dma_data_direction dir); + +extern void swiotlb_tbl_unmap_single(struct device *hwdev, + phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir); -extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, +extern void swiotlb_tbl_sync_single(struct device *hwdev, + phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, enum dma_sync_target target); /* Accessory functions. */ -extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, - enum dma_data_direction dir); - extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fe786f07d2b..fce0a2799d4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, KSWAPD_SKIP_CONGESTION_WAIT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, +#ifdef CONFIG_NUMA_BALANCING + NUMA_PTE_UPDATES, + NUMA_HINT_FAULTS, + NUMA_HINT_FAULTS_LOCAL, + NUMA_PAGE_MIGRATE, +#endif +#ifdef CONFIG_MIGRATION + PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, +#endif #ifdef CONFIG_COMPACTION - COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, + COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, + COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 92a86b2cce3..a13291f7da8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu) #endif /* CONFIG_VM_EVENT_COUNTERS */ +#ifdef CONFIG_NUMA_BALANCING +#define count_vm_numa_event(x) count_vm_event(x) +#define count_vm_numa_events(x, y) count_vm_events(x, y) +#else +#define count_vm_numa_event(x) do {} while (0) +#define count_vm_numa_events(x, y) do {} while (0) +#endif /* CONFIG_NUMA_BALANCING */ + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d49b285385e..f6372b01136 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -15,6 +15,7 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct ext4_extent; +struct extent_status; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -1519,10 +1520,9 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned int len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret), + TP_ARGS(inode, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1530,37 +1530,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit, __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) + __field( unsigned int, flags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pblk = pblk; - __entry->lblk = lblk; - __entry->len = len; + __entry->pblk = map->m_pblk; + __entry->lblk = map->m_lblk; + __entry->len = map->m_len; + __entry->flags = map->m_flags; __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d", + TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk, - __entry->len, __entry->ret) + __entry->len, __entry->flags, __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, unsigned len, int ret), + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret), - TP_ARGS(inode, lblk, pblk, len, ret) + TP_ARGS(inode, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, @@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free, ); TRACE_EVENT(ext4_ext_handle_uninitialized_extents, - TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, + TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), - TP_ARGS(inode, map, allocated, newblock), + TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) @@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->flags = map->m_flags; + __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; @@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents, __entry->newblk = newblock; ), - TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d" + TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, @@ -2055,6 +2055,106 @@ TRACE_EVENT(ext4_ext_remove_space_done, (unsigned short) __entry->eh_entries) ); +TRACE_EVENT(ext4_es_insert_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_remove_extent, + TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + + TP_ARGS(inode, start, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, start ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len) +); + +TRACE_EVENT(ext4_es_find_extent_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t start), + + TP_ARGS(inode, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = start; + ), + + TP_printk("dev %d,%d ino %lu start %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->start) +); + +TRACE_EVENT(ext4_es_find_extent_exit, + TP_PROTO(struct inode *inode, struct extent_status *es, + ext4_lblk_t ret), + + TP_ARGS(inode, es, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, len ) + __field( ext4_lblk_t, ret ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start = es->start; + __entry->len = es->len; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->start, __entry->len, __entry->ret) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h new file mode 100644 index 00000000000..ec2a6ccfd7e --- /dev/null +++ b/include/trace/events/migrate.h @@ -0,0 +1,51 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM migrate + +#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MIGRATE_H + +#define MIGRATE_MODE \ + {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ + {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ + {MIGRATE_SYNC, "MIGRATE_SYNC"} + +#define MIGRATE_REASON \ + {MR_COMPACTION, "compaction"}, \ + {MR_MEMORY_FAILURE, "memory_failure"}, \ + {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ + {MR_SYSCALL, "syscall_or_cpuset"}, \ + {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ + {MR_CMA, "cma"} + +TRACE_EVENT(mm_migrate_pages, + + TP_PROTO(unsigned long succeeded, unsigned long failed, + enum migrate_mode mode, int reason), + + TP_ARGS(succeeded, failed, mode, reason), + + TP_STRUCT__entry( + __field( unsigned long, succeeded) + __field( unsigned long, failed) + __field( enum migrate_mode, mode) + __field( int, reason) + ), + + TP_fast_assign( + __entry->succeeded = succeeded; + __entry->failed = failed; + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", + __entry->succeeded, + __entry->failed, + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + +#endif /* _TRACE_MIGRATE_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 23e62e0537e..0d11c3dcd3a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -20,6 +20,7 @@ enum { MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE, + MPOL_LOCAL, MPOL_MAX, /* always last member of enum */ }; @@ -47,9 +48,15 @@ enum mpol_rebind_step { /* Flags for mbind */ #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ -#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ +#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform + to policy */ +#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ +#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ + +#define MPOL_MF_VALID (MPOL_MF_STRICT | \ + MPOL_MF_MOVE | \ + MPOL_MF_MOVE_ALL) /* * Internal flags that share the struct mempolicy flags word with @@ -59,6 +66,8 @@ enum mpol_rebind_step { #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ +#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ +#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/init/Kconfig b/init/Kconfig index 2054e048bb9..1a207efca59 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -717,6 +717,50 @@ config LOG_BUF_SHIFT config HAVE_UNSTABLE_SCHED_CLOCK bool +# +# For architectures that want to enable the support for NUMA-affine scheduler +# balancing logic: +# +config ARCH_SUPPORTS_NUMA_BALANCING + bool + +# For architectures that (ab)use NUMA to represent different memory regions +# all cpu-local but of different latencies, such as SuperH. +# +config ARCH_WANT_NUMA_VARIABLE_LOCALITY + bool + +# +# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE +config ARCH_WANTS_PROT_NUMA_PROT_NONE + bool + +config ARCH_USES_NUMA_PROT_NONE + bool + default y + depends on ARCH_WANTS_PROT_NUMA_PROT_NONE + depends on NUMA_BALANCING + +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, autonumic NUMA balancing will be enabled if running on a NUMA + machine. + +config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when + it is references to the node the task is running on. + + This system will be inactive on UMA systems. + menuconfig CGROUPS boolean "Control Group support" depends on EVENTFD diff --git a/kernel/cred.c b/kernel/cred.c index 48cea3da6d0..8888afb846e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -30,17 +30,6 @@ static struct kmem_cache *cred_jar; /* - * The common credentials for the initial task's thread group - */ -#ifdef CONFIG_KEYS -static struct thread_group_cred init_tgcred = { - .usage = ATOMIC_INIT(2), - .tgid = 0, - .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), -}; -#endif - -/* * The initial credentials for the initial task */ struct cred init_cred = { @@ -65,9 +54,6 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, -#ifdef CONFIG_KEYS - .tgcred = &init_tgcred, -#endif }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n) } /* - * Dispose of the shared task group credentials - */ -#ifdef CONFIG_KEYS -static void release_tgcred_rcu(struct rcu_head *rcu) -{ - struct thread_group_cred *tgcred = - container_of(rcu, struct thread_group_cred, rcu); - - BUG_ON(atomic_read(&tgcred->usage) != 0); - - key_put(tgcred->session_keyring); - key_put(tgcred->process_keyring); - kfree(tgcred); -} -#endif - -/* - * Release a set of thread group credentials. - */ -static void release_tgcred(struct cred *cred) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = cred->tgcred; - - if (atomic_dec_and_test(&tgcred->usage)) - call_rcu(&tgcred->rcu, release_tgcred_rcu); -#endif -} - -/* * The RCU callback to actually dispose of a set of credentials */ static void put_cred_rcu(struct rcu_head *rcu) @@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu) #endif security_cred_free(cred); + key_put(cred->session_keyring); + key_put(cred->process_keyring); key_put(cred->thread_keyring); key_put(cred->request_key_auth); - release_tgcred(cred); if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); @@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; -#ifdef CONFIG_KEYS - new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); - if (!new->tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } - atomic_set(&new->tgcred->usage, 1); -#endif - atomic_set(&new->usage, 1); #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; @@ -308,9 +256,10 @@ struct cred *prepare_creds(void) get_user_ns(new->user_ns); #ifdef CONFIG_KEYS + key_get(new->session_keyring); + key_get(new->process_keyring); key_get(new->thread_keyring); key_get(new->request_key_auth); - atomic_inc(&new->tgcred->usage); #endif #ifdef CONFIG_SECURITY @@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds); */ struct cred *prepare_exec_creds(void) { - struct thread_group_cred *tgcred = NULL; struct cred *new; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) - return NULL; -#endif - new = prepare_creds(); - if (!new) { - kfree(tgcred); + if (!new) return new; - } #ifdef CONFIG_KEYS /* newly exec'd tasks don't get a thread keyring */ key_put(new->thread_keyring); new->thread_keyring = NULL; - /* create a new per-thread-group creds for all this set of threads to - * share */ - memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - /* inherit the session keyring; new process keyring */ - key_get(tgcred->session_keyring); - tgcred->process_keyring = NULL; - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; #endif return new; @@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void) */ int copy_creds(struct task_struct *p, unsigned long clone_flags) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif struct cred *new; int ret; @@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) install_thread_keyring_to_cred(new); } - /* we share the process and session keyrings between all the threads in - * a process - this is slightly icky as we violate COW credentials a - * bit */ + /* The process keyring is only shared between the threads in a process; + * anything outside of those threads doesn't inherit. + */ if (!(clone_flags & CLONE_THREAD)) { - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - ret = -ENOMEM; - goto error_put; - } - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = key_get(new->tgcred->session_keyring); - - release_tgcred(new); - new->tgcred = tgcred; + key_put(new->process_keyring); + new->process_keyring = NULL; } #endif @@ -643,9 +560,6 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred; -#endif const struct cred *old; struct cred *new; @@ -653,14 +567,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; -#ifdef CONFIG_KEYS - tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); - if (!tgcred) { - kmem_cache_free(cred_jar, new); - return NULL; - } -#endif - kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -678,13 +584,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - tgcred->process_keyring = NULL; - tgcred->session_keyring = NULL; - new->tgcred = tgcred; - new->request_key_auth = NULL; + new->session_keyring = NULL; + new->process_keyring = NULL; new->thread_keyring = NULL; + new->request_key_auth = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 3c31e874afa..115d6c2e4cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; +#endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0533496b622..c1fb82104bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ +} + +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_SCHED_DEBUG +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#else +__read_mostly bool numabalancing_enabled; + +void set_numabalancing_state(bool enabled) +{ + numabalancing_enabled = enabled; } +#endif /* CONFIG_SCHED_DEBUG */ +#endif /* CONFIG_NUMA_BALANCING */ /* * fork()/clone()-time setup: diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 756f9f9e854..9af5af979a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,9 @@ #include <linux/slab.h> #include <linux/profile.h> #include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/migrate.h> +#include <linux/task_work.h> #include <trace/events/sched.h> @@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms + */ +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +static void task_numa_placement(struct task_struct *p) +{ + int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages, bool migrated) +{ + struct task_struct *p = current; + + if (!sched_feat_numa(NUMA)) + return; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + /* + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes + */ + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); + + task_numa_placement(p); +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long start, end; + long pages; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Do not set pte_numa if the current running node is rate-limited. + * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + if (!pages) + return; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. They are not likely to be of relevance */ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + pages -= change_prot_numa(vma, start, end); + + start = end; + if (pages <= 0) + goto out; + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to the vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); + update_rq_runnable_avg(rq, 1); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e68e69ab917..1ad1d2b5395 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5eca173b563..fc886441436 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#ifdef CONFIG_SCHED_DEBUG +#define numabalancing_enabled sched_feat_numa(NUMA) +#else +extern bool numabalancing_enabled; +#endif /* CONFIG_SCHED_DEBUG */ +#else +#define sched_feat_numa(x) (0) +#define numabalancing_enabled (0) +#endif /* CONFIG_NUMA_BALANCING */ + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ee376beedaf..5af44b59377 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -396,25 +396,29 @@ int __secure_computing(int this_syscall) #ifdef CONFIG_SECCOMP_FILTER case SECCOMP_MODE_FILTER: { int data; + struct pt_regs *regs = task_pt_regs(current); ret = seccomp_run_filters(this_syscall); data = ret & SECCOMP_RET_DATA; ret &= SECCOMP_RET_ACTION; switch (ret) { case SECCOMP_RET_ERRNO: /* Set the low-order 16-bits as a errno. */ - syscall_set_return_value(current, task_pt_regs(current), + syscall_set_return_value(current, regs, -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ - syscall_rollback(current, task_pt_regs(current)); + syscall_rollback(current, regs); /* Let the filter pass back 16 bits of data. */ seccomp_send_sigsys(this_syscall, data); goto skip; case SECCOMP_RET_TRACE: /* Skip these calls if there is no tracer. */ - if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) + if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { + syscall_set_return_value(current, regs, + -ENOSYS, 0); goto skip; + } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* @@ -425,6 +429,9 @@ int __secure_computing(int this_syscall) */ if (fatal_signal_pending(current)) break; + if (syscall_get_nr(current, regs) < 0) + goto skip; /* Explicit request to skip. */ + return 0; case SECCOMP_RET_ALLOW: return 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33f71f37267..c88878db491 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index f114bf6a8e1..196b06984de 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -57,7 +57,7 @@ int swiotlb_force; * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this * API. */ -static char *io_tlb_start, *io_tlb_end; +static phys_addr_t io_tlb_start, io_tlb_end; /* * The number of IO TLB blocks (in groups of 64) between io_tlb_start and @@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs; */ static unsigned long io_tlb_overflow = 32*1024; -static void *io_tlb_overflow_buffer; +static phys_addr_t io_tlb_overflow_buffer; /* * This is a free list describing the number of free entries available from @@ -125,27 +125,38 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - phys_addr_t pstart, pend; + unsigned char *vstart, *vend; - pstart = virt_to_phys(io_tlb_start); - pend = virt_to_phys(io_tlb_end); + vstart = phys_to_virt(io_tlb_start); + vend = phys_to_virt(io_tlb_end); printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", - (unsigned long long)pstart, (unsigned long long)pend - 1, - bytes >> 20, io_tlb_start, io_tlb_end - 1); + (unsigned long long)io_tlb_start, + (unsigned long long)io_tlb_end, + bytes >> 20, vstart, vend - 1); } void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { + void *v_overflow_buffer; unsigned long i, bytes; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = tlb; + io_tlb_start = __pa(tlb); io_tlb_end = io_tlb_start + bytes; /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); + if (!v_overflow_buffer) + panic("Cannot allocate SWIOTLB overflow buffer!\n"); + + io_tlb_overflow_buffer = __pa(v_overflow_buffer); + + /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. @@ -156,12 +167,6 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_index = 0; io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - /* - * Get the overflow emergency buffer - */ - io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow)); - if (!io_tlb_overflow_buffer) - panic("Cannot allocate SWIOTLB overflow buffer!\n"); if (verbose) swiotlb_print_info(); } @@ -173,6 +178,7 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) static void __init swiotlb_init_with_default_size(size_t default_size, int verbose) { + unsigned char *vstart; unsigned long bytes; if (!io_tlb_nslabs) { @@ -185,11 +191,11 @@ swiotlb_init_with_default_size(size_t default_size, int verbose) /* * Get IO TLB memory from the low pages */ - io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); - if (!io_tlb_start) + vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes)); + if (!vstart) panic("Cannot allocate SWIOTLB buffer"); - swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose); + swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose); } void __init @@ -207,6 +213,7 @@ int swiotlb_late_init_with_default_size(size_t default_size) { unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned char *vstart = NULL; unsigned int order; int rc = 0; @@ -223,14 +230,14 @@ swiotlb_late_init_with_default_size(size_t default_size) bytes = io_tlb_nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, - order); - if (io_tlb_start) + vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (vstart) break; order--; } - if (!io_tlb_start) { + if (!vstart) { io_tlb_nslabs = req_nslabs; return -ENOMEM; } @@ -239,9 +246,9 @@ swiotlb_late_init_with_default_size(size_t default_size) "for software IO TLB\n", (PAGE_SIZE << order) >> 20); io_tlb_nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); if (rc) - free_pages((unsigned long)io_tlb_start, order); + free_pages((unsigned long)vstart, order); return rc; } @@ -249,14 +256,25 @@ int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; + unsigned char *v_overflow_buffer; bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; - io_tlb_start = tlb; + io_tlb_start = virt_to_phys(tlb); io_tlb_end = io_tlb_start + bytes; - memset(io_tlb_start, 0, bytes); + memset(tlb, 0, bytes); + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!v_overflow_buffer) + goto cleanup2; + + io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); /* * Allocate and initialize the free list array. This array is used @@ -266,7 +284,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) - goto cleanup2; + goto cleanup3; for (i = 0; i < io_tlb_nslabs; i++) io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); @@ -277,18 +295,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) get_order(io_tlb_nslabs * sizeof(phys_addr_t))); if (!io_tlb_orig_addr) - goto cleanup3; + goto cleanup4; memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t)); - /* - * Get the overflow emergency buffer - */ - io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!io_tlb_overflow_buffer) - goto cleanup4; - swiotlb_print_info(); late_alloc = 1; @@ -296,42 +306,42 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; cleanup4: - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - io_tlb_orig_addr = NULL; -cleanup3: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); io_tlb_list = NULL; +cleanup3: + free_pages((unsigned long)v_overflow_buffer, + get_order(io_tlb_overflow)); + io_tlb_overflow_buffer = 0; cleanup2: - io_tlb_end = NULL; - io_tlb_start = NULL; + io_tlb_end = 0; + io_tlb_start = 0; io_tlb_nslabs = 0; return -ENOMEM; } void __init swiotlb_free(void) { - if (!io_tlb_overflow_buffer) + if (!io_tlb_orig_addr) return; if (late_alloc) { - free_pages((unsigned long)io_tlb_overflow_buffer, + free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), get_order(io_tlb_overflow)); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); - free_pages((unsigned long)io_tlb_start, + free_pages((unsigned long)phys_to_virt(io_tlb_start), get_order(io_tlb_nslabs << IO_TLB_SHIFT)); } else { - free_bootmem_late(__pa(io_tlb_overflow_buffer), + free_bootmem_late(io_tlb_overflow_buffer, PAGE_ALIGN(io_tlb_overflow)); free_bootmem_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); free_bootmem_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - free_bootmem_late(__pa(io_tlb_start), + free_bootmem_late(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } io_tlb_nslabs = 0; @@ -339,21 +349,21 @@ void __init swiotlb_free(void) static int is_swiotlb_buffer(phys_addr_t paddr) { - return paddr >= virt_to_phys(io_tlb_start) && - paddr < virt_to_phys(io_tlb_end); + return paddr >= io_tlb_start && paddr < io_tlb_end; } /* * Bounce: copy the swiotlb buffer back to the original dma location */ -void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, - enum dma_data_direction dir) +static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - unsigned long pfn = PFN_DOWN(phys); + unsigned long pfn = PFN_DOWN(orig_addr); + unsigned char *vaddr = phys_to_virt(tlb_addr); if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ - unsigned int offset = phys & ~PAGE_MASK; + unsigned int offset = orig_addr & ~PAGE_MASK; char *buffer; unsigned int sz = 0; unsigned long flags; @@ -364,32 +374,31 @@ void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn)); if (dir == DMA_TO_DEVICE) - memcpy(dma_addr, buffer + offset, sz); + memcpy(vaddr, buffer + offset, sz); else - memcpy(buffer + offset, dma_addr, sz); + memcpy(buffer + offset, vaddr, sz); kunmap_atomic(buffer); local_irq_restore(flags); size -= sz; pfn++; - dma_addr += sz; + vaddr += sz; offset = 0; } + } else if (dir == DMA_TO_DEVICE) { + memcpy(vaddr, phys_to_virt(orig_addr), size); } else { - if (dir == DMA_TO_DEVICE) - memcpy(dma_addr, phys_to_virt(phys), size); - else - memcpy(phys_to_virt(phys), dma_addr, size); + memcpy(phys_to_virt(orig_addr), vaddr, size); } } -EXPORT_SYMBOL_GPL(swiotlb_bounce); -void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, - phys_addr_t phys, size_t size, - enum dma_data_direction dir) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, + dma_addr_t tbl_dma_addr, + phys_addr_t orig_addr, size_t size, + enum dma_data_direction dir) { unsigned long flags; - char *dma_addr; + phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; int i; unsigned long mask; @@ -453,7 +462,7 @@ void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, io_tlb_list[i] = 0; for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) io_tlb_list[i] = ++count; - dma_addr = io_tlb_start + (index << IO_TLB_SHIFT); + tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); /* * Update the indices to avoid searching in the next @@ -471,7 +480,7 @@ void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr, not_found: spin_unlock_irqrestore(&io_tlb_lock, flags); - return NULL; + return SWIOTLB_MAP_ERROR; found: spin_unlock_irqrestore(&io_tlb_lock, flags); @@ -481,11 +490,11 @@ found: * needed. */ for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT); + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - return dma_addr; + return tlb_addr; } EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); @@ -493,11 +502,10 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single); * Allocates bounce buffer and returns its kernel virtual address. */ -static void * -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir) +phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir) { - dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start); + dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start); return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir); } @@ -505,20 +513,19 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, /* * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ -void -swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, - enum dma_data_direction dir) +void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t phys = io_tlb_orig_addr[index]; + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; /* * First, sync the memory before unmapping the entry */ - if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); + if (orig_addr && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -547,26 +554,27 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size, } EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single); -void -swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size, - enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) { - int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t phys = io_tlb_orig_addr[index]; + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; - phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1)); + orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE); + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); break; @@ -589,12 +597,15 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_mask = hwdev->coherent_dma_mask; ret = (void *)__get_free_pages(flags, order); - if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) { - /* - * The allocated memory isn't reachable by the device. - */ - free_pages((unsigned long) ret, order); - ret = NULL; + if (ret) { + dev_addr = swiotlb_virt_to_bus(hwdev, ret); + if (dev_addr + size - 1 > dma_mask) { + /* + * The allocated memory isn't reachable by the device. + */ + free_pages((unsigned long) ret, order); + ret = NULL; + } } if (!ret) { /* @@ -602,25 +613,29 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * GFP_DMA memory; fall back on map_single(), which * will grab memory from the lowest available address range. */ - ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); - if (!ret) + phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE); + if (paddr == SWIOTLB_MAP_ERROR) return NULL; - } - memset(ret, 0, size); - dev_addr = swiotlb_virt_to_bus(hwdev, ret); + ret = phys_to_virt(paddr); + dev_addr = phys_to_dma(hwdev, paddr); - /* Confirm address can be DMA'd by device */ - if (dev_addr + size - 1 > dma_mask) { - printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dma_mask, - (unsigned long long)dev_addr); + /* Confirm address can be DMA'd by device */ + if (dev_addr + size - 1 > dma_mask) { + printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dma_mask, + (unsigned long long)dev_addr); - /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); - return NULL; + /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ + swiotlb_tbl_unmap_single(hwdev, paddr, + size, DMA_TO_DEVICE); + return NULL; + } } + *dma_handle = dev_addr; + memset(ret, 0, size); + return ret; } EXPORT_SYMBOL(swiotlb_alloc_coherent); @@ -636,7 +651,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */ - swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -677,9 +692,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, enum dma_data_direction dir, struct dma_attrs *attrs) { - phys_addr_t phys = page_to_phys(page) + offset; + phys_addr_t map, phys = page_to_phys(page) + offset; dma_addr_t dev_addr = phys_to_dma(dev, phys); - void *map; BUG_ON(dir == DMA_NONE); /* @@ -690,23 +704,19 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, if (dma_capable(dev, dev_addr, size) && !swiotlb_force) return dev_addr; - /* - * Oh well, have to allocate and map a bounce buffer. - */ + /* Oh well, have to allocate and map a bounce buffer. */ map = map_single(dev, phys, size, dir); - if (!map) { + if (map == SWIOTLB_MAP_ERROR) { swiotlb_full(dev, size, dir, 1); - map = io_tlb_overflow_buffer; + return phys_to_dma(dev, io_tlb_overflow_buffer); } - dev_addr = swiotlb_virt_to_bus(dev, map); + dev_addr = phys_to_dma(dev, map); - /* - * Ensure that the address returned is DMA'ble - */ + /* Ensure that the address returned is DMA'ble */ if (!dma_capable(dev, dev_addr, size)) { swiotlb_tbl_unmap_single(dev, map, size, dir); - dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer); + return phys_to_dma(dev, io_tlb_overflow_buffer); } return dev_addr; @@ -729,7 +739,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir); return; } @@ -773,8 +783,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, - target); + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); return; } @@ -831,9 +840,9 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, if (swiotlb_force || !dma_capable(hwdev, dev_addr, sg->length)) { - void *map = map_single(hwdev, sg_phys(sg), - sg->length, dir); - if (!map) { + phys_addr_t map = map_single(hwdev, sg_phys(sg), + sg->length, dir); + if (map == SWIOTLB_MAP_ERROR) { /* Don't panic here, we expect map_sg users to do proper error handling. */ swiotlb_full(hwdev, sg->length, dir, 0); @@ -842,7 +851,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, sgl[0].dma_length = 0; return 0; } - sg->dma_address = swiotlb_virt_to_bus(hwdev, map); + sg->dma_address = phys_to_dma(hwdev, map); } else sg->dma_address = dev_addr; sg->dma_length = sg->length; @@ -925,7 +934,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device); int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) { - return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer)); + return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); } EXPORT_SYMBOL(swiotlb_dma_mapping_error); @@ -938,6 +947,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error); int swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask; + return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } EXPORT_SYMBOL(swiotlb_dma_supported); diff --git a/mm/compaction.c b/mm/compaction.c index 12979121822..5ad7f4f4d6f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); + count_vm_events(COMPACTFREE_SCANNED, nr_scanned); + if (total_isolated) + count_vm_events(COMPACTISOLATED, total_isolated); + return total_isolated; } @@ -609,6 +613,10 @@ next_pageblock: trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); + if (nr_isolated) + count_vm_events(COMPACTISOLATED, nr_isolated); + return low_pfn; } @@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, (unsigned long)cc, false, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; - count_vm_event(COMPACTBLOCKS); - count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); - if (nr_remaining) - count_vm_events(COMPACTPAGEFAILED, nr_remaining); trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c81305..d7ee1691fd2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -19,6 +19,7 @@ #include <linux/freezer.h> #include <linux/mman.h> #include <linux/pagemap.h> +#include <linux/migrate.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -690,7 +691,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -848,7 +849,8 @@ out: * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) @@ -1287,6 +1289,81 @@ out: return page; } +/* NUMA hinting page fault entry point for trans huge pmds */ +int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; + int current_nid = -1; + bool migrated; + bool page_locked = false; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); + get_page(page); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + put_page(page); + goto clear_pmdnuma; + } + + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; + + /* Confirm the PTE did not while locked */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; + +clear_pmdnuma: + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); + +out_unlock: + spin_unlock(&mm->page_table_lock); + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1375,7 +1452,7 @@ out: } int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot) + unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; int ret = 0; @@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; entry = pmdp_get_and_clear(mm, addr, pmd); - entry = pmd_modify(entry, newprot); + if (!prot_numa) + entry = pmd_modify(entry, newprot); + else { + struct page *page = pmd_page(*pmd); + + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && + !pmd_numa(*pmd)) { + entry = pmd_mknuma(entry); + } + } BUG_ON(pmd_write(entry)); set_pmd_at(mm, addr, pmd, entry); spin_unlock(&vma->vm_mm->page_table_lock); @@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, * We can't temporarily set the pmd to null in order * to split it, the pmd must remain marked huge at all * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->mutex to + * and it won't wait on the anon_vma->root->rwsem to * serialize against split_huge_page*. */ pmdp_splitting_flush(vma, address, pmd); @@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) page_tail->mapping = page->mapping; page_tail->index = page->index + i; + page_xchg_last_nid(page_tail, page_last_nid(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); + if (pmd_numa(*pmd)) + entry = pte_mknuma(entry); pte = pte_offset_map(&_pmd, haddr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, return ret; } -/* must be called with anon_vma->root->mutex hold */ +/* must be called with anon_vma->root->rwsem held */ static void __split_huge_page(struct page *page, struct anon_vma *anon_vma) { @@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); BUG_ON(!PageAnon(page)); - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) goto out; ret = 0; @@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) BUG_ON(PageCompound(page)); out_unlock: - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); out: return ret; } @@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; - anon_vma_lock(vma->anon_vma); + anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); ptl = pte_lockptr(mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96b..e5318c7793a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3016,7 +3016,7 @@ same_page: return i ? i : -EFAULT; } -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { struct mm_struct *mm = vma->vm_mm; @@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte_t *ptep; pte_t pte; struct hstate *h = hstate_vma(vma); + unsigned long pages = 0; BUG_ON(address >= end); flush_cache_range(vma, address, end); @@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (!ptep) continue; - if (huge_pmd_unshare(mm, &address, ptep)) + if (huge_pmd_unshare(mm, &address, ptep)) { + pages++; continue; + } if (!huge_pte_none(huge_ptep_get(ptep))) { pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); + pages++; } } spin_unlock(&mm->page_table_lock); @@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + + return pages << h->order; } int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/internal.h b/mm/internal.h index 52d1fa95719..d597f94cc20 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); @@ -1624,7 +1624,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1678,7 +1678,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; @@ -1731,7 +1731,7 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { vma = vmac->vma; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8c..bbfac5063ca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards. */ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f..c6e4dd3e1c0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct anon_vma *av; pgoff_t pgoff; - av = page_lock_anon_vma(page); + av = page_lock_anon_vma_read(page); if (av == NULL) /* Not actually mapped anymore */ return; @@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma(av); + page_unlock_anon_vma_read(av); } /* @@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) page_is_file_cache(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMORY_FAILURE); if (ret) { putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a0..e6a3b933517 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -57,6 +57,7 @@ #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> +#include <linux/migrate.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { split_huge_page_pmd(vma, address, pmd); @@ -1532,6 +1535,8 @@ split_fallthrough: pte = *ptep; if (!pte_present(pte)) goto no_page; + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; @@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (gup_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + /* + * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault + * would be called on PROT_NONE ranges. We must never invoke + * handle_mm_fault on PROT_NONE ranges or the NUMA hinting + * page faults would unprotect the PROT_NONE ranges if + * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd + * bitflag. So to avoid that, don't set FOLL_NUMA if + * FOLL_FORCE is set. + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + i = 0; do { @@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int current_nid) +{ + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + return mpol_misplaced(page, vma, addr); +} + +int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) +{ + struct page *page = NULL; + spinlock_t *ptl; + int current_nid = -1; + int target_nid; + bool migrated = false; + + /* + * The "pte" at this point cannot be used safely without + * validation through pte_unmap_same(). It's of NUMA type but + * the pfn may be screwed if the read is non atomic. + * + * ptep_modify_prot_start is not called as this is clearing + * the _PAGE_NUMA bit and it is not really expected that there + * would be concurrent hardware modifications to the PTE. + */ + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*ptep, pte))) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + pte = pte_mknonnuma(pte); + set_pte_at(mm, addr, ptep, pte); + update_mmu_cache(vma, addr, ptep); + + page = vm_normal_page(vma, addr, pte); + if (!page) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + + current_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, current_nid); + pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { + /* + * Account for the fault against the current node if it not + * being replaced regardless of where the page is located. + */ + current_nid = numa_node_id(); + put_page(page); + goto out; + } + + /* Migrate to the requested node */ + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + current_nid = target_nid; + +out: + if (current_nid != -1) + task_numa_fault(current_nid, 1, migrated); + return 0; +} + +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd; + pte_t *pte, *orig_pte; + unsigned long _addr = addr & PMD_MASK; + unsigned long offset; + spinlock_t *ptl; + bool numa = false; + int local_nid = numa_node_id(); + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; + if (pmd_numa(pmd)) { + set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + numa = true; + } + spin_unlock(&mm->page_table_lock); + + if (!numa) + return 0; + + /* we're in a page fault so some vma must be in the range */ + BUG_ON(!vma); + BUG_ON(vma->vm_start >= _addr + PMD_SIZE); + offset = max(_addr, vma->vm_start) & ~PMD_MASK; + VM_BUG_ON(offset >= PMD_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); + pte += offset >> PAGE_SHIFT; + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; + int curr_nid = local_nid; + int target_nid; + bool migrated; + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) + continue; + if (addr >= vma->vm_end) { + vma = find_vma(mm, addr); + /* there's a pte present so there must be a vma */ + BUG_ON(!vma); + BUG_ON(addr < vma->vm_start); + } + if (pte_numa(pteval)) { + pteval = pte_mknonnuma(pteval); + set_pte_at(mm, addr, pte, pteval); + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + + /* + * Note that the NUMA fault is later accounted to either + * the node that is currently running or where the page is + * migrated to. + */ + curr_nid = local_nid; + target_nid = numa_migrate_prep(page, vma, addr, + page_to_nid(page)); + if (target_nid == -1) { + put_page(page); + continue; + } + + /* Migrate to the requested node */ + pte_unmap_unlock(pte, ptl); + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + curr_nid = target_nid; + task_numa_fault(curr_nid, 1, migrated); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); + } + pte_unmap_unlock(orig_pte, ptl); + + return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } + if (pte_numa(entry)) + return do_numa_page(mm, vma, address, entry, pte, pmd); + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) @@ -3520,8 +3704,11 @@ retry: if (pmd_trans_huge(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - if (dirty && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) { + if (pmd_numa(orig_pmd)) + return do_huge_pmd_numa_page(mm, vma, address, + orig_pmd, pmd); + + if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); /* @@ -3536,16 +3723,21 @@ retry: huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); } + return 0; } } + if (pmd_numa(*pmd)) + return do_pmd_numa_page(mm, vma, address, pmd); + /* * Use __pte_alloc instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) + if (unlikely(pmd_none(*pmd)) && + unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; /* if an huge pmd materialized from under us just retry later */ if (unlikely(pmd_trans_huge(*pmd))) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e8..962e353aa86 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * migrate_pages returns # of failed pages. */ ret = migrate_pages(&source, alloc_migrate_target, 0, - true, MIGRATE_SYNC); + true, MIGRATE_SYNC, + MR_MEMORY_HOTPLUG); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6..d1b315e9862 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -90,6 +90,7 @@ #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -117,6 +118,26 @@ static struct mempolicy default_policy = { .flags = MPOL_F_LOCAL, }; +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +static struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (!pol) { + node = numa_node_id(); + if (node != -1) + pol = &preferred_node_policy[node]; + + /* preferred_node_policy is not initialised early in boot */ + if (!pol->mode) + pol = NULL; + } + + return pol; +} + static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); /* @@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); - return NULL; /* simply delete any existing policy */ + return NULL; } VM_BUG_ON(!nodes); @@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * This is used to mark a range of virtual addresses to be inaccessible. + * These are later cleared by a NUMA hinting fault. Depending on these + * faults, pages may be migrated for better NUMA placement. + * + * This is assuming that NUMA faults are handled using PROT_NONE. If + * an architecture makes a different choice, it will need further + * changes to the core. + */ +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + int nr_updated; + BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); + + nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + + return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ + /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { + unsigned long endvma = vma->vm_end; + + if (endvma > end) + endvma = end; + if (vma->vm_start > start) + start = vma->vm_start; + if (!(flags & MPOL_MF_DISCONTIG_OK)) { if (!vma->vm_next && vma->vm_end < end) return ERR_PTR(-EFAULT); if (prev && prev->vm_end < vma->vm_start) return ERR_PTR(-EFAULT); } - if (!is_vm_hugetlb_page(vma) && - ((flags & MPOL_MF_STRICT) || + + if (is_vm_hugetlb_page(vma)) + goto next; + + if (flags & MPOL_MF_LAZY) { + change_prot_numa(vma, start, endvma); + goto next; + } + + if ((flags & MPOL_MF_STRICT) || ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma)))) { - unsigned long endvma = vma->vm_end; + vma_migratable(vma))) { - if (endvma > end) - endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { @@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, break; } } +next: prev = vma; } return first; @@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_node_page, dest, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, int err; LIST_HEAD(pagelist); - if (flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; @@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); + if (flags & MPOL_MF_LAZY) + new->flags |= MPOL_F_MOF; + /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - + err = PTR_ERR(vma); /* maybe ... */ + if (!IS_ERR(vma)) err = mbind_range(mm, start, end, new); + if (!err) { + int nr_failed = 0; + if (!list_empty(&pagelist)) { + WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, (unsigned long)vma, - false, MIGRATE_SYNC); + false, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND); if (nr_failed) putback_lru_pages(&pagelist); } - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } else putback_lru_pages(&pagelist); @@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, struct mempolicy *get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = task->mempolicy; + struct mempolicy *pol = get_task_policy(task); if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { @@ -1956,7 +2028,7 @@ retry_cpuset: */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { - struct mempolicy *pol = current->mempolicy; + struct mempolicy *pol = get_task_policy(current); struct page *page; unsigned int cpuset_mems_cookie; @@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page - page to be checked + * @vma - vm area where page mapped + * @addr - virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. + * + * Returns: + * -1 - not misplaced, page is in the right node + * node - node id where the page should be + * + * Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zone *zone; + int curnid = page_to_nid(page); + unsigned long pgoff; + int polnid = -1; + int ret = -1; + + BUG_ON(!vma); + + pol = get_vma_policy(current, vma, addr); + if (!(pol->flags & MPOL_F_MOF)) + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + BUG_ON(addr >= vma->vm_end); + BUG_ON(addr < vma->vm_start); + + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, vma, pgoff); + break; + + case MPOL_PREFERRED: + if (pol->flags & MPOL_F_LOCAL) + polnid = numa_node_id(); + else + polnid = pol->v.preferred_node; + break; + + case MPOL_BIND: + /* + * allows binding to multiple nodes. + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->v.nodes)) + goto out; + (void)first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->v.nodes, &zone); + polnid = zone->node; + break; + + default: + BUG(); + } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) { + int last_nid; + + polnid = numa_node_id(); + + /* + * Multi-stage node selection is used in conjunction + * with a periodic migration fault to build a temporal + * task<->page relation. By using a two-stage filter we + * remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist + * probability, we can equate a task's usage of a + * particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and + * getting the same result twice in a row, given these + * samples are fully independent, is then given by + * P(n)^2, provided our sample period is sufficiently + * short compared to the usage pattern. + * + * This quadric squishes small probabilities, making + * it less likely we act on an unlikely task<->page + * relation. + */ + last_nid = page_xchg_last_nid(page, polnid); + if (last_nid != polnid) + goto out; + } + + if (curnid != polnid) + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); @@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + if (nr_node_ids > 1 && !numabalancing_override) { + printk(KERN_INFO "Enabling automatic NUMA balancing. " + "Configure with numa_balancing= or sysctl"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + numabalancing_override = true; + + if (!strcmp(str, "enable")) { + set_numabalancing_state(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_numabalancing_state(false); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2320,6 +2545,15 @@ void __init numa_policy_init(void) sizeof(struct sp_node), 0, SLAB_PANIC, NULL); + for_each_node(nid) { + preferred_node_policy[nid] = (struct mempolicy) { + .refcnt = ATOMIC_INIT(1), + .mode = MPOL_PREFERRED, + .flags = MPOL_F_MOF | MPOL_F_MORON, + .v = { .preferred_node = nid, }, + }; + } + /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or @@ -2346,6 +2580,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ @@ -2362,14 +2598,13 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", - [MPOL_LOCAL] = "local" + [MPOL_LOCAL] = "local", }; @@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode <= MPOL_LOCAL; mode++) { + for (mode = 0; mode < MPOL_MAX; mode++) { if (!strcmp(str, policy_modes[mode])) { break; } } - if (mode > MPOL_LOCAL) + if (mode >= MPOL_MAX) goto out; switch (mode) { diff --git a/mm/migrate.c b/mm/migrate.c index cae02711181..32efd8028bc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,6 +39,9 @@ #include <asm/tlbflush.h> +#define CREATE_TRACE_POINTS +#include <trace/events/migrate.h> + #include "internal.h" /* @@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { - int expected_count; + int expected_count = 0; void **pslot; if (!mapping) { @@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page)) + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (PageAnon(page)) { /* - * Only page_lock_anon_vma() understands the subtleties of + * Only page_lock_anon_vma_read() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. */ anon_vma = page_get_anon_vma(page); @@ -998,10 +1001,11 @@ out: */ int migrate_pages(struct list_head *from, new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; + int nr_succeeded = 0; int pass = 0; struct page *page; struct page *page2; @@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from, retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded++; break; default: /* Permanent failure */ @@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from, } rc = nr_failed + retry; out: + if (nr_succeeded) + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); + if (nr_failed) + count_vm_events(PGMIGRATE_FAIL, nr_failed); + trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); + if (!swapwrite) current->flags &= ~PF_SWAPWRITE; @@ -1176,7 +1187,8 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0, MIGRATE_SYNC); + (unsigned long)pm, 0, MIGRATE_SYNC, + MR_SYSCALL); if (err) putback_lru_pages(&pagelist); } @@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } -#endif + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns true if this is a safe migration target node for misplaced NUMA + * pages. Currently it only checks the watermarks which crude + */ +static bool migrate_balanced_pgdat(struct pglist_data *pgdat, + int nr_migrate_pages) +{ + int z; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable) + continue; + + /* Avoid waking kswapd by allocating pages_to_migrate pages. */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone) + + nr_migrate_pages, + 0, 0)) + continue; + return true; + } + return false; +} + +static struct page *alloc_misplaced_dst_page(struct page *page, + unsigned long data, + int **result) +{ + int nid = (int) data; + struct page *newpage; + + newpage = alloc_pages_exact_node(nid, + (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN) & + ~GFP_IOFS, 0); + if (newpage) + page_xchg_last_nid(newpage, page_last_nid(page)); + + return newpage; +} + +/* + * page migration rate limiting control. + * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs + * window of time. Default here says do not migrate more than 1280M per second. + * If a node is rate-limited then PTE NUMA updates are also rate-limited. However + * as it is faults that reset the window, pte updates will happen unconditionally + * if there has not been a fault since @pteupdate_interval_millisecs after the + * throttle window closed. + */ +static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; +static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); + +/* Returns true if NUMA migration is currently rate limited */ +bool migrate_ratelimited(int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + + if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + + msecs_to_jiffies(pteupdate_interval_millisecs))) + return false; + + if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) + return false; + + return true; +} + +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) +{ + bool rate_limited = false; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + spin_lock(&pgdat->numabalancing_migrate_lock); + if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies + + msecs_to_jiffies(migrate_interval_millisecs); + } + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + rate_limited = true; + else + pgdat->numabalancing_migrate_nr_pages += nr_pages; + spin_unlock(&pgdat->numabalancing_migrate_lock); + + return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ + int ret = 0; + + /* Avoid migrating to a node that is nearly full */ + if (migrate_balanced_pgdat(pgdat, 1)) { + int page_lru; + + if (isolate_lru_page(page)) { + put_page(page); + return 0; + } + + /* Page is isolated */ + ret = 1; + page_lru = page_is_file_cache(page); + if (!PageTransHuge(page)) + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + else + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + HPAGE_PMD_NR); + } + + /* + * Page is either isolated or there is not enough space on the target + * node. If isolated, then it has taken a reference count and the + * callers reference can be safely dropped without the page + * disappearing underneath us during migration. Otherwise the page is + * not to be migrated but the callers reference should still be + * dropped so it does not leak. + */ + put_page(page); + + return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. + */ +int migrate_misplaced_page(struct page *page, int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + int nr_remaining; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; + } + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat, 1)) { + put_page(page); + goto out; + } + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) + goto out; + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); + BUG_ON(!list_empty(&migratepages)); +out: + return isolated; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) + goto out_dropref; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) + goto out_dropref; + + new_page = alloc_pages_node(node, + (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + if (!new_page) { + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + goto out_dropref; + } + page_xchg_last_nid(new_page, page_last_nid(page)); + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) { + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + put_page(new_page); + goto out_keep_locked; + } + + /* Prepare a page as a migration target */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + + /* anon mapping, we can simply copy page->mapping to the new page: */ + new_page->mapping = page->mapping; + new_page->index = page->index; + migrate_page_copy(new_page, page); + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry))) { + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) + SetPageActive(page); + if (TestClearPageUnevictable(new_page)) + SetPageUnevictable(page); + mlock_migrate_page(page, new_page); + + unlock_page(new_page); + put_page(new_page); /* Free it */ + + unlock_page(page); + putback_lru_page(page); + + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + goto out; + } + + /* + * Traditional migration needs to prepare the memcg charge + * transaction early to prevent the old page from being + * uncharged when installing migration entries. Here we can + * save the potential rollback and start the charge transfer + * only when migration is already known to end successfully. + */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = pmd_mknonnuma(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + + page_add_new_anon_rmap(new_page, vma, haddr); + + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, entry); + page_remove_rmap(page); + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge + * before it's fully transferred to the new page. + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); + + unlock_page(new_page); + unlock_page(page); + put_page(page); /* Drop the rmap reference */ + put_page(page); /* Drop the LRU isolation reference */ + + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); + return isolated; + +out_dropref: + put_page(page); +out_keep_locked: + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ diff --git a/mm/mmap.c b/mm/mmap.c index 2b7d9e78a56..f54b235f29a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); if (anon_vma) { VM_BUG_ON(adjust_next && next->anon_vma && anon_vma != next->anon_vma); - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_interval_tree_pre_update_vma(vma); if (adjust_next) anon_vma_interval_tree_pre_update_vma(next); @@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); + down_write(&anon_vma->root->rwsem); /* * We can safely modify head.next after taking the - * anon_vma->root->mutex. If some other vma in this mm shares + * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) @@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->root->mutex. + * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6f..3dca970367d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif -static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa, bool *ret_all_same_node) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; + unsigned long pages = 0; + bool all_same_node = true; + int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool updated = false; ptent = ptep_modify_prot_start(mm, addr, pte); - ptent = pte_modify(ptent, newprot); + if (!prot_numa) { + ptent = pte_modify(ptent, newprot); + updated = true; + } else { + struct page *page; + + page = vm_normal_page(vma, addr, oldpte); + if (page) { + int this_nid = page_to_nid(page); + if (last_nid == -1) + last_nid = this_nid; + if (last_nid != this_nid) + all_same_node = false; + + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { + ptent = pte_mknuma(ptent); + updated = true; + } + } + } /* * Avoid taking write faults for pages we know to be * dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent)) { ptent = pte_mkwrite(ptent); + updated = true; + } + if (updated) + pages++; ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); @@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, swp_entry_to_pte(entry)); } + pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + + *ret_all_same_node = all_same_node; + return pages; } -static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + +static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pmd_t *pmd; unsigned long next; + unsigned long pages = 0; + bool all_same_node; pmd = pmd_offset(pud, addr); do { @@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else if (change_huge_pmd(vma, pmd, addr, newprot)) + else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { + pages += HPAGE_PMD_NR; continue; + } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(vma->vm_mm, pmd, addr, next, newprot, - dirty_accountable); + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa, &all_same_node); + + /* + * If we are changing protections for NUMA hinting faults then + * set pmd_numa if the examined pages were all on the same + * node. This allows a regular PMD to be handled as one fault + * and effectively batches the taking of the PTL + */ + if (prot_numa && all_same_node) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); + + return pages; } -static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { pud_t *pud; unsigned long next; + unsigned long pages = 0; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable); + pages += change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable, prot_numa); } while (pud++, addr = next, addr != end); + + return pages; } -static void change_protection(struct vm_area_struct *vma, +static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable) + int dirty_accountable, int prot_numa) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; unsigned long start = addr; + unsigned long pages = 0; BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); @@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(vma, pgd, addr, next, newprot, - dirty_accountable); + pages += change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable, prot_numa); } while (pgd++, addr = next, addr != end); - flush_tlb_range(vma, start, end); + + /* Only flush the TLB if we actually modified any entries: */ + if (pages) + flush_tlb_range(vma, start, end); + + return pages; +} + +unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long pages; + + mmu_notifier_invalidate_range_start(mm, start, end); + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot); + else + pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + mmu_notifier_invalidate_range_end(mm, start, end); + + return pages; } int @@ -213,12 +304,8 @@ success: dirty_accountable = 1; } - mmu_notifier_invalidate_range_start(mm, start, end); - if (is_vm_hugetlb_page(vma)) - hugetlb_change_protection(vma, start, end, vma->vm_page_prot); - else - change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); - mmu_notifier_invalidate_range_end(mm, start, end); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9..e1031e1f6a6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } if (vma->anon_vma) { anon_vma = vma->anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba11..d037c8bc151 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } + reset_page_last_nid(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); + reset_page_last_nid(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); +#ifdef CONFIG_NUMA_BALANCING + spin_lock_init(&pgdat->numabalancing_migrate_lock); + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies; +#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); @@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, false, MIGRATE_SYNC); + 0, false, MIGRATE_SYNC, + MR_CMA); } putback_movable_pages(&cc->migratepages); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b..0c8323fe6c8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -12,8 +12,8 @@ #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* - * Only sets the access flags (dirty, accessed, and - * writable). Furthermore, we know it always gets set to a "more + * Only sets the access flags (dirty, accessed), as well as write + * permission. Furthermore, we know it always gets set to a "more * permissive" setting, which allows most architectures to optimize * this. We return whether the PTE actually changed, which in turn * instructs the caller to do things like update__mmu_cache. This @@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(*ptep, entry); if (changed) { set_pte_at(vma->vm_mm, address, ptep, entry); - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } return changed; } @@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, { pte_t pte; pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); - flush_tlb_page(vma, address); + if (pte_accessible(pte)) + flush_tlb_page(vma, address); return pte; } #endif diff --git a/mm/rmap.c b/mm/rmap.c index face808a489..2c78f8cadc9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex - * anon_vma->mutex + * anon_vma->rwsem * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) @@ -37,7 +37,7 @@ * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * - * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock */ @@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* - * Synchronize against page_lock_anon_vma() such that + * Synchronize against page_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by - * mutex_trylock() from page_lock_anon_vma(). This orders: + * down_read_trylock() from page_lock_anon_vma_read(). This orders: * - * page_lock_anon_vma() VS put_anon_vma() - * mutex_trylock() atomic_dec_and_test() + * page_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() * LOCK MB - * atomic_read() mutex_is_locked() + * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ - if (mutex_is_locked(&anon_vma->root->mutex)) { - anon_vma_lock(anon_vma); + if (rwsem_is_locked(&anon_vma->root->rwsem)) { + anon_vma_lock_write(anon_vma); anon_vma_unlock(anon_vma); } @@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * allocate a new one. * * Anon-vma allocations are very subtle, because we may have - * optimistically looked up an anon_vma in page_lock_anon_vma() + * optimistically looked up an anon_vma in page_lock_anon_vma_read() * and that may actually touch the spinlock even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). @@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) allocated = anon_vma; } - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { @@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); root = new_root; - mutex_lock(&root->mutex); + down_write(&root->rwsem); } return root; } @@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) - mutex_unlock(&root->mutex); + up_write(&root->rwsem); } /* @@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; - anon_vma_lock(anon_vma); + anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma_unlock(anon_vma); @@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() - * needing to acquire the anon_vma->root->mutex. + * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; @@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; - mutex_init(&anon_vma->mutex); + init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT; } @@ -442,7 +442,7 @@ out: * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with page_get_anon_vma() and then block on the mutex. */ -struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma_read(struct page *page) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; @@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); root_anon_vma = ACCESS_ONCE(anon_vma->root); - if (mutex_trylock(&root_anon_vma->mutex)) { + if (down_read_trylock(&root_anon_vma->rwsem)) { /* * If the page is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!page_mapped(page)) { - mutex_unlock(&root_anon_vma->mutex); + up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; @@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because - * we'll deadlock on the anon_vma_lock() recursion. + * we'll deadlock on the anon_vma_lock_write() recursion. */ - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } @@ -504,9 +504,9 @@ out: return anon_vma; } -void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma_read(struct anon_vma *anon_vma) { - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); } /* @@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, struct anon_vma_chain *avc; int referenced = 0; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return referenced; @@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return referenced; } @@ -1315,7 +1315,7 @@ out_mlock: /* * We need mmap_sem locking, Otherwise VM_LOCKED check makes * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->mutex or mapping->i_mmap_mutex. + * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. * if trylock failed, the page remain in evictable lru and later * vmscan could retry to move the page to unevictable lru if the * page is actually mlocked. @@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = page_lock_anon_vma(page); + anon_vma = page_lock_anon_vma_read(page); if (!anon_vma) return ret; @@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) break; } - page_unlock_anon_vma(anon_vma); + page_unlock_anon_vma_read(anon_vma); return ret; } @@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, int ret = SWAP_AGAIN; /* - * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_sem. Users without mmap_sem are required to * take a reference count to prevent the anon_vma disappearing @@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, anon_vma = page_anon_vma(page); if (!anon_vma) return ret; - anon_vma_lock(anon_vma); + anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); @@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, if (ret != SWAP_AGAIN) break; } - anon_vma_unlock(anon_vma); + anon_vma_unlock_read(anon_vma); return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a3..9800306c819 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -774,10 +774,20 @@ const char * const vmstat_text[] = { "pgrotated", +#ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", +#endif +#ifdef CONFIG_MIGRATION + "pgmigrate_success", + "pgmigrate_fail", +#endif #ifdef CONFIG_COMPACTION - "compact_blocks_moved", - "compact_pages_moved", - "compact_pagemigrate_failed", + "compact_migrate_scanned", + "compact_free_scanned", + "compact_isolated", "compact_stall", "compact_fail", "compact_success", diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 8aa4b111538..0a69d075779 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -259,20 +259,16 @@ static int __init init_dns_resolver(void) if (!cred) return -ENOMEM; - keyring = key_alloc(&key_type_keyring, ".dns_resolver", - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); + keyring = keyring_alloc(".dns_resolver", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - ret = register_key_type(&key_type_dns_resolver); if (ret < 0) goto failed_put_key; @@ -304,3 +300,4 @@ static void __exit exit_dns_resolver(void) module_init(init_dns_resolver) module_exit(exit_dns_resolver) MODULE_LICENSE("GPL"); + diff --git a/security/keys/key.c b/security/keys/key.c index a15c9da8f97..8fb7c7bd465 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -854,13 +854,13 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* if the client doesn't provide, decide on the permissions we want */ if (perm == KEY_PERM_UNDEF) { perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; - perm |= KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_LINK | KEY_USR_SETATTR; + perm |= KEY_USR_VIEW; if (ktype->read) - perm |= KEY_POS_READ | KEY_USR_READ; + perm |= KEY_POS_READ; if (ktype == &key_type_keyring || ktype->update) - perm |= KEY_USR_WRITE; + perm |= KEY_POS_WRITE; } /* allocate a new key */ diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 5d34b4e827d..4b5c948eb41 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1132,12 +1132,12 @@ long keyctl_instantiate_key_iov(key_serial_t id, ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), iovstack, &iov); if (ret < 0) - return ret; + goto err; if (ret == 0) goto no_payload_free; ret = keyctl_instantiate_key_common(id, iov, ioc, ret, ringid); - +err: if (iov != iovstack) kfree(iov); return ret; @@ -1495,7 +1495,8 @@ long keyctl_session_to_parent(void) goto error_keyring; newwork = &cred->rcu; - cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); + cred->session_keyring = key_ref_to_ptr(keyring_r); + keyring_r = NULL; init_task_work(newwork, key_change_session_keyring); me = current; @@ -1519,7 +1520,7 @@ long keyctl_session_to_parent(void) mycred = current_cred(); pcred = __task_cred(parent); if (mycred == pcred || - mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) { + mycred->session_keyring == pcred->session_keyring) { ret = 0; goto unlock; } @@ -1535,9 +1536,9 @@ long keyctl_session_to_parent(void) goto unlock; /* the keyrings must have the same UID */ - if ((pcred->tgcred->session_keyring && - !uid_eq(pcred->tgcred->session_keyring->uid, mycred->euid)) || - !uid_eq(mycred->tgcred->session_keyring->uid, mycred->euid)) + if ((pcred->session_keyring && + !uid_eq(pcred->session_keyring->uid, mycred->euid)) || + !uid_eq(mycred->session_keyring->uid, mycred->euid)) goto unlock; /* cancel an already pending keyring replacement */ diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 6e42df15a24..6ece7f2e570 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -257,17 +257,14 @@ error: * Allocate a keyring and link into the destination keyring. */ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, - const struct cred *cred, unsigned long flags, - struct key *dest) + const struct cred *cred, key_perm_t perm, + unsigned long flags, struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, - flags); - + uid, gid, cred, perm, flags); if (!IS_ERR(keyring)) { ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL); if (ret < 0) { @@ -278,6 +275,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, return keyring; } +EXPORT_SYMBOL(keyring_alloc); /** * keyring_search_aux - Search a keyring tree for a key matching some criteria diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 86468f385fc..58dfe089094 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -45,10 +45,12 @@ int install_user_keyrings(void) struct user_struct *user; const struct cred *cred; struct key *uid_keyring, *session_keyring; + key_perm_t user_keyring_perm; char buf[20]; int ret; uid_t uid; + user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; cred = current_cred(); user = cred->user; uid = from_kuid(cred->user_ns, user->uid); @@ -73,8 +75,8 @@ int install_user_keyrings(void) uid_keyring = find_keyring_by_name(buf, true); if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, KEY_ALLOC_IN_QUOTA, - NULL); + cred, user_keyring_perm, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; @@ -89,7 +91,8 @@ int install_user_keyrings(void) if (IS_ERR(session_keyring)) { session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, - cred, KEY_ALLOC_IN_QUOTA, NULL); + cred, user_keyring_perm, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; @@ -130,6 +133,7 @@ int install_thread_keyring_to_cred(struct cred *new) struct key *keyring; keyring = keyring_alloc("_tid", new->uid, new->gid, new, + KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); @@ -170,27 +174,18 @@ static int install_thread_keyring(void) int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; - int ret; - if (new->tgcred->process_keyring) + if (new->process_keyring) return -EEXIST; - keyring = keyring_alloc("_pid", new->uid, new->gid, - new, KEY_ALLOC_QUOTA_OVERRUN, NULL); + keyring = keyring_alloc("_pid", new->uid, new->gid, new, + KEY_POS_ALL | KEY_USR_VIEW, + KEY_ALLOC_QUOTA_OVERRUN, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); - spin_lock_irq(&new->tgcred->lock); - if (!new->tgcred->process_keyring) { - new->tgcred->process_keyring = keyring; - keyring = NULL; - ret = 0; - } else { - ret = -EEXIST; - } - spin_unlock_irq(&new->tgcred->lock); - key_put(keyring); - return ret; + new->process_keyring = keyring; + return 0; } /* @@ -231,11 +226,12 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; - if (cred->tgcred->session_keyring) + if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; - keyring = keyring_alloc("_ses", cred->uid, cred->gid, - cred, flags, NULL); + keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, + flags, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { @@ -243,17 +239,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) } /* install the keyring */ - spin_lock_irq(&cred->tgcred->lock); - old = cred->tgcred->session_keyring; - rcu_assign_pointer(cred->tgcred->session_keyring, keyring); - spin_unlock_irq(&cred->tgcred->lock); - - /* we're using RCU on the pointer, but there's no point synchronising - * on it if it didn't previously point to anything */ - if (old) { - synchronize_rcu(); + old = cred->session_keyring; + rcu_assign_pointer(cred->session_keyring, keyring); + + if (old) key_put(old); - } return 0; } @@ -368,9 +358,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the process keyring second */ - if (cred->tgcred->process_keyring) { + if (cred->process_keyring) { key_ref = keyring_search_aux( - make_key_ref(cred->tgcred->process_keyring, 1), + make_key_ref(cred->process_keyring, 1), cred, type, description, match, no_state_check); if (!IS_ERR(key_ref)) goto found; @@ -389,12 +379,10 @@ key_ref_t search_my_process_keyrings(struct key_type *type, } /* search the session keyring */ - if (cred->tgcred->session_keyring) { + if (cred->session_keyring) { rcu_read_lock(); key_ref = keyring_search_aux( - make_key_ref(rcu_dereference( - cred->tgcred->session_keyring), - 1), + make_key_ref(rcu_dereference(cred->session_keyring), 1), cred, type, description, match, no_state_check); rcu_read_unlock(); @@ -564,7 +552,7 @@ try_again: break; case KEY_SPEC_PROCESS_KEYRING: - if (!cred->tgcred->process_keyring) { + if (!cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; @@ -576,13 +564,13 @@ try_again: goto reget_creds; } - key = cred->tgcred->process_keyring; + key = cred->process_keyring; atomic_inc(&key->usage); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: - if (!cred->tgcred->session_keyring) { + if (!cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = install_user_keyrings(); @@ -597,7 +585,7 @@ try_again: if (ret < 0) goto error; goto reget_creds; - } else if (cred->tgcred->session_keyring == + } else if (cred->session_keyring == cred->user->session_keyring && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); @@ -607,7 +595,7 @@ try_again: } rcu_read_lock(); - key = rcu_dereference(cred->tgcred->session_keyring); + key = rcu_dereference(cred->session_keyring); atomic_inc(&key->usage); rcu_read_unlock(); key_ref = make_key_ref(key, 1); @@ -767,12 +755,6 @@ long join_session_keyring(const char *name) struct key *keyring; long ret, serial; - /* only permit this if there's a single thread in the thread group - - * this avoids us having to adjust the creds on all threads and risking - * ENOMEM */ - if (!current_is_single_threaded()) - return -EMLINK; - new = prepare_creds(); if (!new) return -ENOMEM; @@ -784,7 +766,7 @@ long join_session_keyring(const char *name) if (ret < 0) goto error; - serial = new->tgcred->session_keyring->serial; + serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; @@ -798,8 +780,10 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, old->uid, old->gid, old, - KEY_ALLOC_IN_QUOTA, NULL); + keyring = keyring_alloc( + name, old->uid, old->gid, old, + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, + KEY_ALLOC_IN_QUOTA, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; @@ -807,6 +791,9 @@ long join_session_keyring(const char *name) } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; + } else if (keyring == new->session_keyring) { + ret = 0; + goto error2; } /* we've got a keyring - now to install it */ @@ -863,8 +850,7 @@ void key_change_session_keyring(struct callback_head *twork) new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); - new->tgcred->tgid = old->tgcred->tgid; - new->tgcred->process_keyring = key_get(old->tgcred->process_keyring); + new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 66e21184b55..4bd6bdb7419 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -126,6 +126,7 @@ static int call_sbin_request_key(struct key_construction *cons, cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, + KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL); put_cred(cred); if (IS_ERR(keyring)) { @@ -150,12 +151,12 @@ static int call_sbin_request_key(struct key_construction *cons, cred->thread_keyring ? cred->thread_keyring->serial : 0); prkey = 0; - if (cred->tgcred->process_keyring) - prkey = cred->tgcred->process_keyring->serial; + if (cred->process_keyring) + prkey = cred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); rcu_read_lock(); - session = rcu_dereference(cred->tgcred->session_keyring); + session = rcu_dereference(cred->session_keyring); if (!session) session = cred->user->session_keyring; sskey = session->serial; @@ -297,14 +298,14 @@ static void construct_get_dest_keyring(struct key **_dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: - dest_keyring = key_get(cred->tgcred->process_keyring); + dest_keyring = key_get(cred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( - rcu_dereference(cred->tgcred->session_keyring)); + rcu_dereference(cred->session_keyring)); rcu_read_unlock(); if (dest_keyring) @@ -347,6 +348,7 @@ static int construct_alloc_key(struct key_type *type, const struct cred *cred = current_cred(); unsigned long prealloc; struct key *key; + key_perm_t perm; key_ref_t key_ref; int ret; @@ -355,8 +357,15 @@ static int construct_alloc_key(struct key_type *type, *_key = NULL; mutex_lock(&user->cons_lock); + perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; + perm |= KEY_USR_VIEW; + if (type->read) + perm |= KEY_POS_READ; + if (type == &key_type_keyring || type->update) + perm |= KEY_POS_WRITE; + key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred, - KEY_POS_ALL, flags); + perm, flags); if (IS_ERR(key)) goto alloc_failed; diff --git a/security/smack/Kconfig b/security/smack/Kconfig index 603b0878434..e69de9c642b 100644 --- a/security/smack/Kconfig +++ b/security/smack/Kconfig @@ -1,6 +1,10 @@ config SECURITY_SMACK bool "Simplified Mandatory Access Control Kernel Support" - depends on NETLABEL && SECURITY_NETWORK + depends on NET + depends on INET + depends on SECURITY + select NETLABEL + select SECURITY_NETWORK default n help This selects the Simplified Mandatory Access Control Kernel. diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 99929a50093..76a5dca4640 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -2063,6 +2063,19 @@ static const struct file_operations smk_revoke_subj_ops = { .llseek = generic_file_llseek, }; +static struct kset *smackfs_kset; +/** + * smk_init_sysfs - initialize /sys/fs/smackfs + * + */ +static int smk_init_sysfs(void) +{ + smackfs_kset = kset_create_and_add("smackfs", NULL, fs_kobj); + if (!smackfs_kset) + return -ENOMEM; + return 0; +} + /** * smk_fill_super - fill the /smackfs superblock * @sb: the empty superblock @@ -2183,6 +2196,10 @@ static int __init init_smk_fs(void) if (!security_module_enable(&smack_ops)) return 0; + err = smk_init_sysfs(); + if (err) + printk(KERN_ERR "smackfs: sysfs mountpoint problem.\n"); + err = register_filesystem(&smk_fs_type); if (!err) { smackfs_mount = kern_mount(&smk_fs_type); diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index b4c29848b49..2663145d119 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -17,6 +17,7 @@ #include <linux/ptrace.h> #include <linux/prctl.h> #include <linux/ratelimit.h> +#include <linux/workqueue.h> #define YAMA_SCOPE_DISABLED 0 #define YAMA_SCOPE_RELATIONAL 1 @@ -29,12 +30,37 @@ static int ptrace_scope = YAMA_SCOPE_RELATIONAL; struct ptrace_relation { struct task_struct *tracer; struct task_struct *tracee; + bool invalid; struct list_head node; + struct rcu_head rcu; }; static LIST_HEAD(ptracer_relations); static DEFINE_SPINLOCK(ptracer_relations_lock); +static void yama_relation_cleanup(struct work_struct *work); +static DECLARE_WORK(yama_relation_work, yama_relation_cleanup); + +/** + * yama_relation_cleanup - remove invalid entries from the relation list + * + */ +static void yama_relation_cleanup(struct work_struct *work) +{ + struct ptrace_relation *relation; + + spin_lock(&ptracer_relations_lock); + rcu_read_lock(); + list_for_each_entry_rcu(relation, &ptracer_relations, node) { + if (relation->invalid) { + list_del_rcu(&relation->node); + kfree_rcu(relation, rcu); + } + } + rcu_read_unlock(); + spin_unlock(&ptracer_relations_lock); +} + /** * yama_ptracer_add - add/replace an exception for this tracer/tracee pair * @tracer: the task_struct of the process doing the ptrace @@ -48,32 +74,34 @@ static DEFINE_SPINLOCK(ptracer_relations_lock); static int yama_ptracer_add(struct task_struct *tracer, struct task_struct *tracee) { - int rc = 0; - struct ptrace_relation *added; - struct ptrace_relation *entry, *relation = NULL; + struct ptrace_relation *relation, *added; added = kmalloc(sizeof(*added), GFP_KERNEL); if (!added) return -ENOMEM; - spin_lock_bh(&ptracer_relations_lock); - list_for_each_entry(entry, &ptracer_relations, node) - if (entry->tracee == tracee) { - relation = entry; - break; + added->tracee = tracee; + added->tracer = tracer; + added->invalid = false; + + spin_lock(&ptracer_relations_lock); + rcu_read_lock(); + list_for_each_entry_rcu(relation, &ptracer_relations, node) { + if (relation->invalid) + continue; + if (relation->tracee == tracee) { + list_replace_rcu(&relation->node, &added->node); + kfree_rcu(relation, rcu); + goto out; } - if (!relation) { - relation = added; - relation->tracee = tracee; - list_add(&relation->node, &ptracer_relations); } - relation->tracer = tracer; - spin_unlock_bh(&ptracer_relations_lock); - if (added != relation) - kfree(added); + list_add_rcu(&added->node, &ptracer_relations); - return rc; +out: + rcu_read_unlock(); + spin_unlock(&ptracer_relations_lock); + return 0; } /** @@ -84,16 +112,23 @@ static int yama_ptracer_add(struct task_struct *tracer, static void yama_ptracer_del(struct task_struct *tracer, struct task_struct *tracee) { - struct ptrace_relation *relation, *safe; + struct ptrace_relation *relation; + bool marked = false; - spin_lock_bh(&ptracer_relations_lock); - list_for_each_entry_safe(relation, safe, &ptracer_relations, node) + rcu_read_lock(); + list_for_each_entry_rcu(relation, &ptracer_relations, node) { + if (relation->invalid) + continue; if (relation->tracee == tracee || (tracer && relation->tracer == tracer)) { - list_del(&relation->node); - kfree(relation); + relation->invalid = true; + marked = true; } - spin_unlock_bh(&ptracer_relations_lock); + } + rcu_read_unlock(); + + if (marked) + schedule_work(&yama_relation_work); } /** @@ -217,21 +252,22 @@ static int ptracer_exception_found(struct task_struct *tracer, struct task_struct *parent = NULL; bool found = false; - spin_lock_bh(&ptracer_relations_lock); rcu_read_lock(); if (!thread_group_leader(tracee)) tracee = rcu_dereference(tracee->group_leader); - list_for_each_entry(relation, &ptracer_relations, node) + list_for_each_entry_rcu(relation, &ptracer_relations, node) { + if (relation->invalid) + continue; if (relation->tracee == tracee) { parent = relation->tracer; found = true; break; } + } if (found && (parent == NULL || task_is_descendant(parent, tracer))) rc = 1; rcu_read_unlock(); - spin_unlock_bh(&ptracer_relations_lock); return rc; } |