aboutsummaryrefslogtreecommitdiff
path: root/drivers/net/e1000/e1000_hw.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/e1000/e1000_hw.c')
-rw-r--r--drivers/net/e1000/e1000_hw.c5821
1 files changed, 0 insertions, 5821 deletions
diff --git a/drivers/net/e1000/e1000_hw.c b/drivers/net/e1000/e1000_hw.c
deleted file mode 100644
index 1698622af43..00000000000
--- a/drivers/net/e1000/e1000_hw.c
+++ /dev/null
@@ -1,5821 +0,0 @@
-/*******************************************************************************
-
- Intel PRO/1000 Linux driver
- Copyright(c) 1999 - 2006 Intel Corporation.
-
- This program is free software; you can redistribute it and/or modify it
- under the terms and conditions of the GNU General Public License,
- version 2, as published by the Free Software Foundation.
-
- This program is distributed in the hope it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc.,
- 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-
- The full GNU General Public License is included in this distribution in
- the file called "COPYING".
-
- Contact Information:
- Linux NICS <linux.nics@intel.com>
- e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
- Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
-
- */
-
-/* e1000_hw.c
- * Shared functions for accessing and configuring the MAC
- */
-
-#include "e1000.h"
-
-static s32 e1000_check_downshift(struct e1000_hw *hw);
-static s32 e1000_check_polarity(struct e1000_hw *hw,
- e1000_rev_polarity *polarity);
-static void e1000_clear_hw_cntrs(struct e1000_hw *hw);
-static void e1000_clear_vfta(struct e1000_hw *hw);
-static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw,
- bool link_up);
-static s32 e1000_config_fc_after_link_up(struct e1000_hw *hw);
-static s32 e1000_detect_gig_phy(struct e1000_hw *hw);
-static s32 e1000_get_auto_rd_done(struct e1000_hw *hw);
-static s32 e1000_get_cable_length(struct e1000_hw *hw, u16 *min_length,
- u16 *max_length);
-static s32 e1000_get_phy_cfg_done(struct e1000_hw *hw);
-static s32 e1000_id_led_init(struct e1000_hw *hw);
-static void e1000_init_rx_addrs(struct e1000_hw *hw);
-static s32 e1000_phy_igp_get_info(struct e1000_hw *hw,
- struct e1000_phy_info *phy_info);
-static s32 e1000_phy_m88_get_info(struct e1000_hw *hw,
- struct e1000_phy_info *phy_info);
-static s32 e1000_set_d3_lplu_state(struct e1000_hw *hw, bool active);
-static s32 e1000_wait_autoneg(struct e1000_hw *hw);
-static void e1000_write_reg_io(struct e1000_hw *hw, u32 offset, u32 value);
-static s32 e1000_set_phy_type(struct e1000_hw *hw);
-static void e1000_phy_init_script(struct e1000_hw *hw);
-static s32 e1000_setup_copper_link(struct e1000_hw *hw);
-static s32 e1000_setup_fiber_serdes_link(struct e1000_hw *hw);
-static s32 e1000_adjust_serdes_amplitude(struct e1000_hw *hw);
-static s32 e1000_phy_force_speed_duplex(struct e1000_hw *hw);
-static s32 e1000_config_mac_to_phy(struct e1000_hw *hw);
-static void e1000_raise_mdi_clk(struct e1000_hw *hw, u32 *ctrl);
-static void e1000_lower_mdi_clk(struct e1000_hw *hw, u32 *ctrl);
-static void e1000_shift_out_mdi_bits(struct e1000_hw *hw, u32 data, u16 count);
-static u16 e1000_shift_in_mdi_bits(struct e1000_hw *hw);
-static s32 e1000_phy_reset_dsp(struct e1000_hw *hw);
-static s32 e1000_write_eeprom_spi(struct e1000_hw *hw, u16 offset,
- u16 words, u16 *data);
-static s32 e1000_write_eeprom_microwire(struct e1000_hw *hw, u16 offset,
- u16 words, u16 *data);
-static s32 e1000_spi_eeprom_ready(struct e1000_hw *hw);
-static void e1000_raise_ee_clk(struct e1000_hw *hw, u32 *eecd);
-static void e1000_lower_ee_clk(struct e1000_hw *hw, u32 *eecd);
-static void e1000_shift_out_ee_bits(struct e1000_hw *hw, u16 data, u16 count);
-static s32 e1000_write_phy_reg_ex(struct e1000_hw *hw, u32 reg_addr,
- u16 phy_data);
-static s32 e1000_read_phy_reg_ex(struct e1000_hw *hw, u32 reg_addr,
- u16 *phy_data);
-static u16 e1000_shift_in_ee_bits(struct e1000_hw *hw, u16 count);
-static s32 e1000_acquire_eeprom(struct e1000_hw *hw);
-static void e1000_release_eeprom(struct e1000_hw *hw);
-static void e1000_standby_eeprom(struct e1000_hw *hw);
-static s32 e1000_set_vco_speed(struct e1000_hw *hw);
-static s32 e1000_polarity_reversal_workaround(struct e1000_hw *hw);
-static s32 e1000_set_phy_mode(struct e1000_hw *hw);
-static s32 e1000_do_read_eeprom(struct e1000_hw *hw, u16 offset, u16 words,
- u16 *data);
-static s32 e1000_do_write_eeprom(struct e1000_hw *hw, u16 offset, u16 words,
- u16 *data);
-
-/* IGP cable length table */
-static const
-u16 e1000_igp_cable_length_table[IGP01E1000_AGC_LENGTH_TABLE_SIZE] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
- 5, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 25, 25, 25,
- 25, 25, 25, 25, 30, 30, 30, 30, 40, 40, 40, 40, 40, 40, 40, 40,
- 40, 50, 50, 50, 50, 50, 50, 50, 60, 60, 60, 60, 60, 60, 60, 60,
- 60, 70, 70, 70, 70, 70, 70, 80, 80, 80, 80, 80, 80, 90, 90, 90,
- 90, 90, 90, 90, 90, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100,
- 100,
- 100, 100, 100, 100, 110, 110, 110, 110, 110, 110, 110, 110, 110, 110,
- 110, 110,
- 110, 110, 110, 110, 110, 110, 120, 120, 120, 120, 120, 120, 120, 120,
- 120, 120
-};
-
-static DEFINE_SPINLOCK(e1000_eeprom_lock);
-
-/**
- * e1000_set_phy_type - Set the phy type member in the hw struct.
- * @hw: Struct containing variables accessed by shared code
- */
-static s32 e1000_set_phy_type(struct e1000_hw *hw)
-{
- e_dbg("e1000_set_phy_type");
-
- if (hw->mac_type == e1000_undefined)
- return -E1000_ERR_PHY_TYPE;
-
- switch (hw->phy_id) {
- case M88E1000_E_PHY_ID:
- case M88E1000_I_PHY_ID:
- case M88E1011_I_PHY_ID:
- case M88E1111_I_PHY_ID:
- case M88E1118_E_PHY_ID:
- hw->phy_type = e1000_phy_m88;
- break;
- case IGP01E1000_I_PHY_ID:
- if (hw->mac_type == e1000_82541 ||
- hw->mac_type == e1000_82541_rev_2 ||
- hw->mac_type == e1000_82547 ||
- hw->mac_type == e1000_82547_rev_2)
- hw->phy_type = e1000_phy_igp;
- break;
- case RTL8211B_PHY_ID:
- hw->phy_type = e1000_phy_8211;
- break;
- case RTL8201N_PHY_ID:
- hw->phy_type = e1000_phy_8201;
- break;
- default:
- /* Should never have loaded on this device */
- hw->phy_type = e1000_phy_undefined;
- return -E1000_ERR_PHY_TYPE;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_init_script - IGP phy init script - initializes the GbE PHY
- * @hw: Struct containing variables accessed by shared code
- */
-static void e1000_phy_init_script(struct e1000_hw *hw)
-{
- u32 ret_val;
- u16 phy_saved_data;
-
- e_dbg("e1000_phy_init_script");
-
- if (hw->phy_init_script) {
- msleep(20);
-
- /* Save off the current value of register 0x2F5B to be restored at
- * the end of this routine. */
- ret_val = e1000_read_phy_reg(hw, 0x2F5B, &phy_saved_data);
-
- /* Disabled the PHY transmitter */
- e1000_write_phy_reg(hw, 0x2F5B, 0x0003);
- msleep(20);
-
- e1000_write_phy_reg(hw, 0x0000, 0x0140);
- msleep(5);
-
- switch (hw->mac_type) {
- case e1000_82541:
- case e1000_82547:
- e1000_write_phy_reg(hw, 0x1F95, 0x0001);
- e1000_write_phy_reg(hw, 0x1F71, 0xBD21);
- e1000_write_phy_reg(hw, 0x1F79, 0x0018);
- e1000_write_phy_reg(hw, 0x1F30, 0x1600);
- e1000_write_phy_reg(hw, 0x1F31, 0x0014);
- e1000_write_phy_reg(hw, 0x1F32, 0x161C);
- e1000_write_phy_reg(hw, 0x1F94, 0x0003);
- e1000_write_phy_reg(hw, 0x1F96, 0x003F);
- e1000_write_phy_reg(hw, 0x2010, 0x0008);
- break;
-
- case e1000_82541_rev_2:
- case e1000_82547_rev_2:
- e1000_write_phy_reg(hw, 0x1F73, 0x0099);
- break;
- default:
- break;
- }
-
- e1000_write_phy_reg(hw, 0x0000, 0x3300);
- msleep(20);
-
- /* Now enable the transmitter */
- e1000_write_phy_reg(hw, 0x2F5B, phy_saved_data);
-
- if (hw->mac_type == e1000_82547) {
- u16 fused, fine, coarse;
-
- /* Move to analog registers page */
- e1000_read_phy_reg(hw,
- IGP01E1000_ANALOG_SPARE_FUSE_STATUS,
- &fused);
-
- if (!(fused & IGP01E1000_ANALOG_SPARE_FUSE_ENABLED)) {
- e1000_read_phy_reg(hw,
- IGP01E1000_ANALOG_FUSE_STATUS,
- &fused);
-
- fine = fused & IGP01E1000_ANALOG_FUSE_FINE_MASK;
- coarse =
- fused & IGP01E1000_ANALOG_FUSE_COARSE_MASK;
-
- if (coarse >
- IGP01E1000_ANALOG_FUSE_COARSE_THRESH) {
- coarse -=
- IGP01E1000_ANALOG_FUSE_COARSE_10;
- fine -= IGP01E1000_ANALOG_FUSE_FINE_1;
- } else if (coarse ==
- IGP01E1000_ANALOG_FUSE_COARSE_THRESH)
- fine -= IGP01E1000_ANALOG_FUSE_FINE_10;
-
- fused =
- (fused & IGP01E1000_ANALOG_FUSE_POLY_MASK) |
- (fine & IGP01E1000_ANALOG_FUSE_FINE_MASK) |
- (coarse &
- IGP01E1000_ANALOG_FUSE_COARSE_MASK);
-
- e1000_write_phy_reg(hw,
- IGP01E1000_ANALOG_FUSE_CONTROL,
- fused);
- e1000_write_phy_reg(hw,
- IGP01E1000_ANALOG_FUSE_BYPASS,
- IGP01E1000_ANALOG_FUSE_ENABLE_SW_CONTROL);
- }
- }
- }
-}
-
-/**
- * e1000_set_mac_type - Set the mac type member in the hw struct.
- * @hw: Struct containing variables accessed by shared code
- */
-s32 e1000_set_mac_type(struct e1000_hw *hw)
-{
- e_dbg("e1000_set_mac_type");
-
- switch (hw->device_id) {
- case E1000_DEV_ID_82542:
- switch (hw->revision_id) {
- case E1000_82542_2_0_REV_ID:
- hw->mac_type = e1000_82542_rev2_0;
- break;
- case E1000_82542_2_1_REV_ID:
- hw->mac_type = e1000_82542_rev2_1;
- break;
- default:
- /* Invalid 82542 revision ID */
- return -E1000_ERR_MAC_TYPE;
- }
- break;
- case E1000_DEV_ID_82543GC_FIBER:
- case E1000_DEV_ID_82543GC_COPPER:
- hw->mac_type = e1000_82543;
- break;
- case E1000_DEV_ID_82544EI_COPPER:
- case E1000_DEV_ID_82544EI_FIBER:
- case E1000_DEV_ID_82544GC_COPPER:
- case E1000_DEV_ID_82544GC_LOM:
- hw->mac_type = e1000_82544;
- break;
- case E1000_DEV_ID_82540EM:
- case E1000_DEV_ID_82540EM_LOM:
- case E1000_DEV_ID_82540EP:
- case E1000_DEV_ID_82540EP_LOM:
- case E1000_DEV_ID_82540EP_LP:
- hw->mac_type = e1000_82540;
- break;
- case E1000_DEV_ID_82545EM_COPPER:
- case E1000_DEV_ID_82545EM_FIBER:
- hw->mac_type = e1000_82545;
- break;
- case E1000_DEV_ID_82545GM_COPPER:
- case E1000_DEV_ID_82545GM_FIBER:
- case E1000_DEV_ID_82545GM_SERDES:
- hw->mac_type = e1000_82545_rev_3;
- break;
- case E1000_DEV_ID_82546EB_COPPER:
- case E1000_DEV_ID_82546EB_FIBER:
- case E1000_DEV_ID_82546EB_QUAD_COPPER:
- hw->mac_type = e1000_82546;
- break;
- case E1000_DEV_ID_82546GB_COPPER:
- case E1000_DEV_ID_82546GB_FIBER:
- case E1000_DEV_ID_82546GB_SERDES:
- case E1000_DEV_ID_82546GB_PCIE:
- case E1000_DEV_ID_82546GB_QUAD_COPPER:
- case E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3:
- hw->mac_type = e1000_82546_rev_3;
- break;
- case E1000_DEV_ID_82541EI:
- case E1000_DEV_ID_82541EI_MOBILE:
- case E1000_DEV_ID_82541ER_LOM:
- hw->mac_type = e1000_82541;
- break;
- case E1000_DEV_ID_82541ER:
- case E1000_DEV_ID_82541GI:
- case E1000_DEV_ID_82541GI_LF:
- case E1000_DEV_ID_82541GI_MOBILE:
- hw->mac_type = e1000_82541_rev_2;
- break;
- case E1000_DEV_ID_82547EI:
- case E1000_DEV_ID_82547EI_MOBILE:
- hw->mac_type = e1000_82547;
- break;
- case E1000_DEV_ID_82547GI:
- hw->mac_type = e1000_82547_rev_2;
- break;
- case E1000_DEV_ID_INTEL_CE4100_GBE:
- hw->mac_type = e1000_ce4100;
- break;
- default:
- /* Should never have loaded on this device */
- return -E1000_ERR_MAC_TYPE;
- }
-
- switch (hw->mac_type) {
- case e1000_82541:
- case e1000_82547:
- case e1000_82541_rev_2:
- case e1000_82547_rev_2:
- hw->asf_firmware_present = true;
- break;
- default:
- break;
- }
-
- /* The 82543 chip does not count tx_carrier_errors properly in
- * FD mode
- */
- if (hw->mac_type == e1000_82543)
- hw->bad_tx_carr_stats_fd = true;
-
- if (hw->mac_type > e1000_82544)
- hw->has_smbus = true;
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_set_media_type - Set media type and TBI compatibility.
- * @hw: Struct containing variables accessed by shared code
- */
-void e1000_set_media_type(struct e1000_hw *hw)
-{
- u32 status;
-
- e_dbg("e1000_set_media_type");
-
- if (hw->mac_type != e1000_82543) {
- /* tbi_compatibility is only valid on 82543 */
- hw->tbi_compatibility_en = false;
- }
-
- switch (hw->device_id) {
- case E1000_DEV_ID_82545GM_SERDES:
- case E1000_DEV_ID_82546GB_SERDES:
- hw->media_type = e1000_media_type_internal_serdes;
- break;
- default:
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- hw->media_type = e1000_media_type_fiber;
- break;
- case e1000_ce4100:
- hw->media_type = e1000_media_type_copper;
- break;
- default:
- status = er32(STATUS);
- if (status & E1000_STATUS_TBIMODE) {
- hw->media_type = e1000_media_type_fiber;
- /* tbi_compatibility not valid on fiber */
- hw->tbi_compatibility_en = false;
- } else {
- hw->media_type = e1000_media_type_copper;
- }
- break;
- }
- }
-}
-
-/**
- * e1000_reset_hw: reset the hardware completely
- * @hw: Struct containing variables accessed by shared code
- *
- * Reset the transmit and receive units; mask and clear all interrupts.
- */
-s32 e1000_reset_hw(struct e1000_hw *hw)
-{
- u32 ctrl;
- u32 ctrl_ext;
- u32 icr;
- u32 manc;
- u32 led_ctrl;
- s32 ret_val;
-
- e_dbg("e1000_reset_hw");
-
- /* For 82542 (rev 2.0), disable MWI before issuing a device reset */
- if (hw->mac_type == e1000_82542_rev2_0) {
- e_dbg("Disabling MWI on 82542 rev 2.0\n");
- e1000_pci_clear_mwi(hw);
- }
-
- /* Clear interrupt mask to stop board from generating interrupts */
- e_dbg("Masking off all interrupts\n");
- ew32(IMC, 0xffffffff);
-
- /* Disable the Transmit and Receive units. Then delay to allow
- * any pending transactions to complete before we hit the MAC with
- * the global reset.
- */
- ew32(RCTL, 0);
- ew32(TCTL, E1000_TCTL_PSP);
- E1000_WRITE_FLUSH();
-
- /* The tbi_compatibility_on Flag must be cleared when Rctl is cleared. */
- hw->tbi_compatibility_on = false;
-
- /* Delay to allow any outstanding PCI transactions to complete before
- * resetting the device
- */
- msleep(10);
-
- ctrl = er32(CTRL);
-
- /* Must reset the PHY before resetting the MAC */
- if ((hw->mac_type == e1000_82541) || (hw->mac_type == e1000_82547)) {
- ew32(CTRL, (ctrl | E1000_CTRL_PHY_RST));
- msleep(5);
- }
-
- /* Issue a global reset to the MAC. This will reset the chip's
- * transmit, receive, DMA, and link units. It will not effect
- * the current PCI configuration. The global reset bit is self-
- * clearing, and should clear within a microsecond.
- */
- e_dbg("Issuing a global reset to MAC\n");
-
- switch (hw->mac_type) {
- case e1000_82544:
- case e1000_82540:
- case e1000_82545:
- case e1000_82546:
- case e1000_82541:
- case e1000_82541_rev_2:
- /* These controllers can't ack the 64-bit write when issuing the
- * reset, so use IO-mapping as a workaround to issue the reset */
- E1000_WRITE_REG_IO(hw, CTRL, (ctrl | E1000_CTRL_RST));
- break;
- case e1000_82545_rev_3:
- case e1000_82546_rev_3:
- /* Reset is performed on a shadow of the control register */
- ew32(CTRL_DUP, (ctrl | E1000_CTRL_RST));
- break;
- case e1000_ce4100:
- default:
- ew32(CTRL, (ctrl | E1000_CTRL_RST));
- break;
- }
-
- /* After MAC reset, force reload of EEPROM to restore power-on settings to
- * device. Later controllers reload the EEPROM automatically, so just wait
- * for reload to complete.
- */
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- case e1000_82544:
- /* Wait for reset to complete */
- udelay(10);
- ctrl_ext = er32(CTRL_EXT);
- ctrl_ext |= E1000_CTRL_EXT_EE_RST;
- ew32(CTRL_EXT, ctrl_ext);
- E1000_WRITE_FLUSH();
- /* Wait for EEPROM reload */
- msleep(2);
- break;
- case e1000_82541:
- case e1000_82541_rev_2:
- case e1000_82547:
- case e1000_82547_rev_2:
- /* Wait for EEPROM reload */
- msleep(20);
- break;
- default:
- /* Auto read done will delay 5ms or poll based on mac type */
- ret_val = e1000_get_auto_rd_done(hw);
- if (ret_val)
- return ret_val;
- break;
- }
-
- /* Disable HW ARPs on ASF enabled adapters */
- if (hw->mac_type >= e1000_82540) {
- manc = er32(MANC);
- manc &= ~(E1000_MANC_ARP_EN);
- ew32(MANC, manc);
- }
-
- if ((hw->mac_type == e1000_82541) || (hw->mac_type == e1000_82547)) {
- e1000_phy_init_script(hw);
-
- /* Configure activity LED after PHY reset */
- led_ctrl = er32(LEDCTL);
- led_ctrl &= IGP_ACTIVITY_LED_MASK;
- led_ctrl |= (IGP_ACTIVITY_LED_ENABLE | IGP_LED3_MODE);
- ew32(LEDCTL, led_ctrl);
- }
-
- /* Clear interrupt mask to stop board from generating interrupts */
- e_dbg("Masking off all interrupts\n");
- ew32(IMC, 0xffffffff);
-
- /* Clear any pending interrupt events. */
- icr = er32(ICR);
-
- /* If MWI was previously enabled, reenable it. */
- if (hw->mac_type == e1000_82542_rev2_0) {
- if (hw->pci_cmd_word & PCI_COMMAND_INVALIDATE)
- e1000_pci_set_mwi(hw);
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_init_hw: Performs basic configuration of the adapter.
- * @hw: Struct containing variables accessed by shared code
- *
- * Assumes that the controller has previously been reset and is in a
- * post-reset uninitialized state. Initializes the receive address registers,
- * multicast table, and VLAN filter table. Calls routines to setup link
- * configuration and flow control settings. Clears all on-chip counters. Leaves
- * the transmit and receive units disabled and uninitialized.
- */
-s32 e1000_init_hw(struct e1000_hw *hw)
-{
- u32 ctrl;
- u32 i;
- s32 ret_val;
- u32 mta_size;
- u32 ctrl_ext;
-
- e_dbg("e1000_init_hw");
-
- /* Initialize Identification LED */
- ret_val = e1000_id_led_init(hw);
- if (ret_val) {
- e_dbg("Error Initializing Identification LED\n");
- return ret_val;
- }
-
- /* Set the media type and TBI compatibility */
- e1000_set_media_type(hw);
-
- /* Disabling VLAN filtering. */
- e_dbg("Initializing the IEEE VLAN\n");
- if (hw->mac_type < e1000_82545_rev_3)
- ew32(VET, 0);
- e1000_clear_vfta(hw);
-
- /* For 82542 (rev 2.0), disable MWI and put the receiver into reset */
- if (hw->mac_type == e1000_82542_rev2_0) {
- e_dbg("Disabling MWI on 82542 rev 2.0\n");
- e1000_pci_clear_mwi(hw);
- ew32(RCTL, E1000_RCTL_RST);
- E1000_WRITE_FLUSH();
- msleep(5);
- }
-
- /* Setup the receive address. This involves initializing all of the Receive
- * Address Registers (RARs 0 - 15).
- */
- e1000_init_rx_addrs(hw);
-
- /* For 82542 (rev 2.0), take the receiver out of reset and enable MWI */
- if (hw->mac_type == e1000_82542_rev2_0) {
- ew32(RCTL, 0);
- E1000_WRITE_FLUSH();
- msleep(1);
- if (hw->pci_cmd_word & PCI_COMMAND_INVALIDATE)
- e1000_pci_set_mwi(hw);
- }
-
- /* Zero out the Multicast HASH table */
- e_dbg("Zeroing the MTA\n");
- mta_size = E1000_MC_TBL_SIZE;
- for (i = 0; i < mta_size; i++) {
- E1000_WRITE_REG_ARRAY(hw, MTA, i, 0);
- /* use write flush to prevent Memory Write Block (MWB) from
- * occurring when accessing our register space */
- E1000_WRITE_FLUSH();
- }
-
- /* Set the PCI priority bit correctly in the CTRL register. This
- * determines if the adapter gives priority to receives, or if it
- * gives equal priority to transmits and receives. Valid only on
- * 82542 and 82543 silicon.
- */
- if (hw->dma_fairness && hw->mac_type <= e1000_82543) {
- ctrl = er32(CTRL);
- ew32(CTRL, ctrl | E1000_CTRL_PRIOR);
- }
-
- switch (hw->mac_type) {
- case e1000_82545_rev_3:
- case e1000_82546_rev_3:
- break;
- default:
- /* Workaround for PCI-X problem when BIOS sets MMRBC incorrectly. */
- if (hw->bus_type == e1000_bus_type_pcix
- && e1000_pcix_get_mmrbc(hw) > 2048)
- e1000_pcix_set_mmrbc(hw, 2048);
- break;
- }
-
- /* Call a subroutine to configure the link and setup flow control. */
- ret_val = e1000_setup_link(hw);
-
- /* Set the transmit descriptor write-back policy */
- if (hw->mac_type > e1000_82544) {
- ctrl = er32(TXDCTL);
- ctrl =
- (ctrl & ~E1000_TXDCTL_WTHRESH) |
- E1000_TXDCTL_FULL_TX_DESC_WB;
- ew32(TXDCTL, ctrl);
- }
-
- /* Clear all of the statistics registers (clear on read). It is
- * important that we do this after we have tried to establish link
- * because the symbol error count will increment wildly if there
- * is no link.
- */
- e1000_clear_hw_cntrs(hw);
-
- if (hw->device_id == E1000_DEV_ID_82546GB_QUAD_COPPER ||
- hw->device_id == E1000_DEV_ID_82546GB_QUAD_COPPER_KSP3) {
- ctrl_ext = er32(CTRL_EXT);
- /* Relaxed ordering must be disabled to avoid a parity
- * error crash in a PCI slot. */
- ctrl_ext |= E1000_CTRL_EXT_RO_DIS;
- ew32(CTRL_EXT, ctrl_ext);
- }
-
- return ret_val;
-}
-
-/**
- * e1000_adjust_serdes_amplitude - Adjust SERDES output amplitude based on EEPROM setting.
- * @hw: Struct containing variables accessed by shared code.
- */
-static s32 e1000_adjust_serdes_amplitude(struct e1000_hw *hw)
-{
- u16 eeprom_data;
- s32 ret_val;
-
- e_dbg("e1000_adjust_serdes_amplitude");
-
- if (hw->media_type != e1000_media_type_internal_serdes)
- return E1000_SUCCESS;
-
- switch (hw->mac_type) {
- case e1000_82545_rev_3:
- case e1000_82546_rev_3:
- break;
- default:
- return E1000_SUCCESS;
- }
-
- ret_val = e1000_read_eeprom(hw, EEPROM_SERDES_AMPLITUDE, 1,
- &eeprom_data);
- if (ret_val) {
- return ret_val;
- }
-
- if (eeprom_data != EEPROM_RESERVED_WORD) {
- /* Adjust SERDES output amplitude only. */
- eeprom_data &= EEPROM_SERDES_AMPLITUDE_MASK;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_EXT_CTRL, eeprom_data);
- if (ret_val)
- return ret_val;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_setup_link - Configures flow control and link settings.
- * @hw: Struct containing variables accessed by shared code
- *
- * Determines which flow control settings to use. Calls the appropriate media-
- * specific link configuration function. Configures the flow control settings.
- * Assuming the adapter has a valid link partner, a valid link should be
- * established. Assumes the hardware has previously been reset and the
- * transmitter and receiver are not enabled.
- */
-s32 e1000_setup_link(struct e1000_hw *hw)
-{
- u32 ctrl_ext;
- s32 ret_val;
- u16 eeprom_data;
-
- e_dbg("e1000_setup_link");
-
- /* Read and store word 0x0F of the EEPROM. This word contains bits
- * that determine the hardware's default PAUSE (flow control) mode,
- * a bit that determines whether the HW defaults to enabling or
- * disabling auto-negotiation, and the direction of the
- * SW defined pins. If there is no SW over-ride of the flow
- * control setting, then the variable hw->fc will
- * be initialized based on a value in the EEPROM.
- */
- if (hw->fc == E1000_FC_DEFAULT) {
- ret_val = e1000_read_eeprom(hw, EEPROM_INIT_CONTROL2_REG,
- 1, &eeprom_data);
- if (ret_val) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
- if ((eeprom_data & EEPROM_WORD0F_PAUSE_MASK) == 0)
- hw->fc = E1000_FC_NONE;
- else if ((eeprom_data & EEPROM_WORD0F_PAUSE_MASK) ==
- EEPROM_WORD0F_ASM_DIR)
- hw->fc = E1000_FC_TX_PAUSE;
- else
- hw->fc = E1000_FC_FULL;
- }
-
- /* We want to save off the original Flow Control configuration just
- * in case we get disconnected and then reconnected into a different
- * hub or switch with different Flow Control capabilities.
- */
- if (hw->mac_type == e1000_82542_rev2_0)
- hw->fc &= (~E1000_FC_TX_PAUSE);
-
- if ((hw->mac_type < e1000_82543) && (hw->report_tx_early == 1))
- hw->fc &= (~E1000_FC_RX_PAUSE);
-
- hw->original_fc = hw->fc;
-
- e_dbg("After fix-ups FlowControl is now = %x\n", hw->fc);
-
- /* Take the 4 bits from EEPROM word 0x0F that determine the initial
- * polarity value for the SW controlled pins, and setup the
- * Extended Device Control reg with that info.
- * This is needed because one of the SW controlled pins is used for
- * signal detection. So this should be done before e1000_setup_pcs_link()
- * or e1000_phy_setup() is called.
- */
- if (hw->mac_type == e1000_82543) {
- ret_val = e1000_read_eeprom(hw, EEPROM_INIT_CONTROL2_REG,
- 1, &eeprom_data);
- if (ret_val) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
- ctrl_ext = ((eeprom_data & EEPROM_WORD0F_SWPDIO_EXT) <<
- SWDPIO__EXT_SHIFT);
- ew32(CTRL_EXT, ctrl_ext);
- }
-
- /* Call the necessary subroutine to configure the link. */
- ret_val = (hw->media_type == e1000_media_type_copper) ?
- e1000_setup_copper_link(hw) : e1000_setup_fiber_serdes_link(hw);
-
- /* Initialize the flow control address, type, and PAUSE timer
- * registers to their default values. This is done even if flow
- * control is disabled, because it does not hurt anything to
- * initialize these registers.
- */
- e_dbg("Initializing the Flow Control address, type and timer regs\n");
-
- ew32(FCT, FLOW_CONTROL_TYPE);
- ew32(FCAH, FLOW_CONTROL_ADDRESS_HIGH);
- ew32(FCAL, FLOW_CONTROL_ADDRESS_LOW);
-
- ew32(FCTTV, hw->fc_pause_time);
-
- /* Set the flow control receive threshold registers. Normally,
- * these registers will be set to a default threshold that may be
- * adjusted later by the driver's runtime code. However, if the
- * ability to transmit pause frames in not enabled, then these
- * registers will be set to 0.
- */
- if (!(hw->fc & E1000_FC_TX_PAUSE)) {
- ew32(FCRTL, 0);
- ew32(FCRTH, 0);
- } else {
- /* We need to set up the Receive Threshold high and low water marks
- * as well as (optionally) enabling the transmission of XON frames.
- */
- if (hw->fc_send_xon) {
- ew32(FCRTL, (hw->fc_low_water | E1000_FCRTL_XONE));
- ew32(FCRTH, hw->fc_high_water);
- } else {
- ew32(FCRTL, hw->fc_low_water);
- ew32(FCRTH, hw->fc_high_water);
- }
- }
- return ret_val;
-}
-
-/**
- * e1000_setup_fiber_serdes_link - prepare fiber or serdes link
- * @hw: Struct containing variables accessed by shared code
- *
- * Manipulates Physical Coding Sublayer functions in order to configure
- * link. Assumes the hardware has been previously reset and the transmitter
- * and receiver are not enabled.
- */
-static s32 e1000_setup_fiber_serdes_link(struct e1000_hw *hw)
-{
- u32 ctrl;
- u32 status;
- u32 txcw = 0;
- u32 i;
- u32 signal = 0;
- s32 ret_val;
-
- e_dbg("e1000_setup_fiber_serdes_link");
-
- /* On adapters with a MAC newer than 82544, SWDP 1 will be
- * set when the optics detect a signal. On older adapters, it will be
- * cleared when there is a signal. This applies to fiber media only.
- * If we're on serdes media, adjust the output amplitude to value
- * set in the EEPROM.
- */
- ctrl = er32(CTRL);
- if (hw->media_type == e1000_media_type_fiber)
- signal = (hw->mac_type > e1000_82544) ? E1000_CTRL_SWDPIN1 : 0;
-
- ret_val = e1000_adjust_serdes_amplitude(hw);
- if (ret_val)
- return ret_val;
-
- /* Take the link out of reset */
- ctrl &= ~(E1000_CTRL_LRST);
-
- /* Adjust VCO speed to improve BER performance */
- ret_val = e1000_set_vco_speed(hw);
- if (ret_val)
- return ret_val;
-
- e1000_config_collision_dist(hw);
-
- /* Check for a software override of the flow control settings, and setup
- * the device accordingly. If auto-negotiation is enabled, then software
- * will have to set the "PAUSE" bits to the correct value in the Tranmsit
- * Config Word Register (TXCW) and re-start auto-negotiation. However, if
- * auto-negotiation is disabled, then software will have to manually
- * configure the two flow control enable bits in the CTRL register.
- *
- * The possible values of the "fc" parameter are:
- * 0: Flow control is completely disabled
- * 1: Rx flow control is enabled (we can receive pause frames, but
- * not send pause frames).
- * 2: Tx flow control is enabled (we can send pause frames but we do
- * not support receiving pause frames).
- * 3: Both Rx and TX flow control (symmetric) are enabled.
- */
- switch (hw->fc) {
- case E1000_FC_NONE:
- /* Flow control is completely disabled by a software over-ride. */
- txcw = (E1000_TXCW_ANE | E1000_TXCW_FD);
- break;
- case E1000_FC_RX_PAUSE:
- /* RX Flow control is enabled and TX Flow control is disabled by a
- * software over-ride. Since there really isn't a way to advertise
- * that we are capable of RX Pause ONLY, we will advertise that we
- * support both symmetric and asymmetric RX PAUSE. Later, we will
- * disable the adapter's ability to send PAUSE frames.
- */
- txcw = (E1000_TXCW_ANE | E1000_TXCW_FD | E1000_TXCW_PAUSE_MASK);
- break;
- case E1000_FC_TX_PAUSE:
- /* TX Flow control is enabled, and RX Flow control is disabled, by a
- * software over-ride.
- */
- txcw = (E1000_TXCW_ANE | E1000_TXCW_FD | E1000_TXCW_ASM_DIR);
- break;
- case E1000_FC_FULL:
- /* Flow control (both RX and TX) is enabled by a software over-ride. */
- txcw = (E1000_TXCW_ANE | E1000_TXCW_FD | E1000_TXCW_PAUSE_MASK);
- break;
- default:
- e_dbg("Flow control param set incorrectly\n");
- return -E1000_ERR_CONFIG;
- break;
- }
-
- /* Since auto-negotiation is enabled, take the link out of reset (the link
- * will be in reset, because we previously reset the chip). This will
- * restart auto-negotiation. If auto-negotiation is successful then the
- * link-up status bit will be set and the flow control enable bits (RFCE
- * and TFCE) will be set according to their negotiated value.
- */
- e_dbg("Auto-negotiation enabled\n");
-
- ew32(TXCW, txcw);
- ew32(CTRL, ctrl);
- E1000_WRITE_FLUSH();
-
- hw->txcw = txcw;
- msleep(1);
-
- /* If we have a signal (the cable is plugged in) then poll for a "Link-Up"
- * indication in the Device Status Register. Time-out if a link isn't
- * seen in 500 milliseconds seconds (Auto-negotiation should complete in
- * less than 500 milliseconds even if the other end is doing it in SW).
- * For internal serdes, we just assume a signal is present, then poll.
- */
- if (hw->media_type == e1000_media_type_internal_serdes ||
- (er32(CTRL) & E1000_CTRL_SWDPIN1) == signal) {
- e_dbg("Looking for Link\n");
- for (i = 0; i < (LINK_UP_TIMEOUT / 10); i++) {
- msleep(10);
- status = er32(STATUS);
- if (status & E1000_STATUS_LU)
- break;
- }
- if (i == (LINK_UP_TIMEOUT / 10)) {
- e_dbg("Never got a valid link from auto-neg!!!\n");
- hw->autoneg_failed = 1;
- /* AutoNeg failed to achieve a link, so we'll call
- * e1000_check_for_link. This routine will force the link up if
- * we detect a signal. This will allow us to communicate with
- * non-autonegotiating link partners.
- */
- ret_val = e1000_check_for_link(hw);
- if (ret_val) {
- e_dbg("Error while checking for link\n");
- return ret_val;
- }
- hw->autoneg_failed = 0;
- } else {
- hw->autoneg_failed = 0;
- e_dbg("Valid Link Found\n");
- }
- } else {
- e_dbg("No Signal Detected\n");
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_rtl_setup - Copper link setup for e1000_phy_rtl series.
- * @hw: Struct containing variables accessed by shared code
- *
- * Commits changes to PHY configuration by calling e1000_phy_reset().
- */
-static s32 e1000_copper_link_rtl_setup(struct e1000_hw *hw)
-{
- s32 ret_val;
-
- /* SW reset the PHY so all changes take effect */
- ret_val = e1000_phy_reset(hw);
- if (ret_val) {
- e_dbg("Error Resetting the PHY\n");
- return ret_val;
- }
-
- return E1000_SUCCESS;
-}
-
-static s32 gbe_dhg_phy_setup(struct e1000_hw *hw)
-{
- s32 ret_val;
- u32 ctrl_aux;
-
- switch (hw->phy_type) {
- case e1000_phy_8211:
- ret_val = e1000_copper_link_rtl_setup(hw);
- if (ret_val) {
- e_dbg("e1000_copper_link_rtl_setup failed!\n");
- return ret_val;
- }
- break;
- case e1000_phy_8201:
- /* Set RMII mode */
- ctrl_aux = er32(CTL_AUX);
- ctrl_aux |= E1000_CTL_AUX_RMII;
- ew32(CTL_AUX, ctrl_aux);
- E1000_WRITE_FLUSH();
-
- /* Disable the J/K bits required for receive */
- ctrl_aux = er32(CTL_AUX);
- ctrl_aux |= 0x4;
- ctrl_aux &= ~0x2;
- ew32(CTL_AUX, ctrl_aux);
- E1000_WRITE_FLUSH();
- ret_val = e1000_copper_link_rtl_setup(hw);
-
- if (ret_val) {
- e_dbg("e1000_copper_link_rtl_setup failed!\n");
- return ret_val;
- }
- break;
- default:
- e_dbg("Error Resetting the PHY\n");
- return E1000_ERR_PHY_TYPE;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_preconfig - early configuration for copper
- * @hw: Struct containing variables accessed by shared code
- *
- * Make sure we have a valid PHY and change PHY mode before link setup.
- */
-static s32 e1000_copper_link_preconfig(struct e1000_hw *hw)
-{
- u32 ctrl;
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_copper_link_preconfig");
-
- ctrl = er32(CTRL);
- /* With 82543, we need to force speed and duplex on the MAC equal to what
- * the PHY speed and duplex configuration is. In addition, we need to
- * perform a hardware reset on the PHY to take it out of reset.
- */
- if (hw->mac_type > e1000_82543) {
- ctrl |= E1000_CTRL_SLU;
- ctrl &= ~(E1000_CTRL_FRCSPD | E1000_CTRL_FRCDPX);
- ew32(CTRL, ctrl);
- } else {
- ctrl |=
- (E1000_CTRL_FRCSPD | E1000_CTRL_FRCDPX | E1000_CTRL_SLU);
- ew32(CTRL, ctrl);
- ret_val = e1000_phy_hw_reset(hw);
- if (ret_val)
- return ret_val;
- }
-
- /* Make sure we have a valid PHY */
- ret_val = e1000_detect_gig_phy(hw);
- if (ret_val) {
- e_dbg("Error, did not detect valid phy.\n");
- return ret_val;
- }
- e_dbg("Phy ID = %x\n", hw->phy_id);
-
- /* Set PHY to class A mode (if necessary) */
- ret_val = e1000_set_phy_mode(hw);
- if (ret_val)
- return ret_val;
-
- if ((hw->mac_type == e1000_82545_rev_3) ||
- (hw->mac_type == e1000_82546_rev_3)) {
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, &phy_data);
- phy_data |= 0x00000008;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, phy_data);
- }
-
- if (hw->mac_type <= e1000_82543 ||
- hw->mac_type == e1000_82541 || hw->mac_type == e1000_82547 ||
- hw->mac_type == e1000_82541_rev_2
- || hw->mac_type == e1000_82547_rev_2)
- hw->phy_reset_disable = false;
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_igp_setup - Copper link setup for e1000_phy_igp series.
- * @hw: Struct containing variables accessed by shared code
- */
-static s32 e1000_copper_link_igp_setup(struct e1000_hw *hw)
-{
- u32 led_ctrl;
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_copper_link_igp_setup");
-
- if (hw->phy_reset_disable)
- return E1000_SUCCESS;
-
- ret_val = e1000_phy_reset(hw);
- if (ret_val) {
- e_dbg("Error Resetting the PHY\n");
- return ret_val;
- }
-
- /* Wait 15ms for MAC to configure PHY from eeprom settings */
- msleep(15);
- /* Configure activity LED after PHY reset */
- led_ctrl = er32(LEDCTL);
- led_ctrl &= IGP_ACTIVITY_LED_MASK;
- led_ctrl |= (IGP_ACTIVITY_LED_ENABLE | IGP_LED3_MODE);
- ew32(LEDCTL, led_ctrl);
-
- /* The NVM settings will configure LPLU in D3 for IGP2 and IGP3 PHYs */
- if (hw->phy_type == e1000_phy_igp) {
- /* disable lplu d3 during driver init */
- ret_val = e1000_set_d3_lplu_state(hw, false);
- if (ret_val) {
- e_dbg("Error Disabling LPLU D3\n");
- return ret_val;
- }
- }
-
- /* Configure mdi-mdix settings */
- ret_val = e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- if ((hw->mac_type == e1000_82541) || (hw->mac_type == e1000_82547)) {
- hw->dsp_config_state = e1000_dsp_config_disabled;
- /* Force MDI for earlier revs of the IGP PHY */
- phy_data &=
- ~(IGP01E1000_PSCR_AUTO_MDIX |
- IGP01E1000_PSCR_FORCE_MDI_MDIX);
- hw->mdix = 1;
-
- } else {
- hw->dsp_config_state = e1000_dsp_config_enabled;
- phy_data &= ~IGP01E1000_PSCR_AUTO_MDIX;
-
- switch (hw->mdix) {
- case 1:
- phy_data &= ~IGP01E1000_PSCR_FORCE_MDI_MDIX;
- break;
- case 2:
- phy_data |= IGP01E1000_PSCR_FORCE_MDI_MDIX;
- break;
- case 0:
- default:
- phy_data |= IGP01E1000_PSCR_AUTO_MDIX;
- break;
- }
- }
- ret_val = e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- /* set auto-master slave resolution settings */
- if (hw->autoneg) {
- e1000_ms_type phy_ms_setting = hw->master_slave;
-
- if (hw->ffe_config_state == e1000_ffe_config_active)
- hw->ffe_config_state = e1000_ffe_config_enabled;
-
- if (hw->dsp_config_state == e1000_dsp_config_activated)
- hw->dsp_config_state = e1000_dsp_config_enabled;
-
- /* when autonegotiation advertisement is only 1000Mbps then we
- * should disable SmartSpeed and enable Auto MasterSlave
- * resolution as hardware default. */
- if (hw->autoneg_advertised == ADVERTISE_1000_FULL) {
- /* Disable SmartSpeed */
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- &phy_data);
- if (ret_val)
- return ret_val;
- phy_data &= ~IGP01E1000_PSCFR_SMART_SPEED;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- phy_data);
- if (ret_val)
- return ret_val;
- /* Set auto Master/Slave resolution process */
- ret_val =
- e1000_read_phy_reg(hw, PHY_1000T_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
- phy_data &= ~CR_1000T_MS_ENABLE;
- ret_val =
- e1000_write_phy_reg(hw, PHY_1000T_CTRL, phy_data);
- if (ret_val)
- return ret_val;
- }
-
- ret_val = e1000_read_phy_reg(hw, PHY_1000T_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- /* load defaults for future use */
- hw->original_master_slave = (phy_data & CR_1000T_MS_ENABLE) ?
- ((phy_data & CR_1000T_MS_VALUE) ?
- e1000_ms_force_master :
- e1000_ms_force_slave) : e1000_ms_auto;
-
- switch (phy_ms_setting) {
- case e1000_ms_force_master:
- phy_data |= (CR_1000T_MS_ENABLE | CR_1000T_MS_VALUE);
- break;
- case e1000_ms_force_slave:
- phy_data |= CR_1000T_MS_ENABLE;
- phy_data &= ~(CR_1000T_MS_VALUE);
- break;
- case e1000_ms_auto:
- phy_data &= ~CR_1000T_MS_ENABLE;
- default:
- break;
- }
- ret_val = e1000_write_phy_reg(hw, PHY_1000T_CTRL, phy_data);
- if (ret_val)
- return ret_val;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_mgp_setup - Copper link setup for e1000_phy_m88 series.
- * @hw: Struct containing variables accessed by shared code
- */
-static s32 e1000_copper_link_mgp_setup(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_copper_link_mgp_setup");
-
- if (hw->phy_reset_disable)
- return E1000_SUCCESS;
-
- /* Enable CRS on TX. This must be set for half-duplex operation. */
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= M88E1000_PSCR_ASSERT_CRS_ON_TX;
-
- /* Options:
- * MDI/MDI-X = 0 (default)
- * 0 - Auto for all speeds
- * 1 - MDI mode
- * 2 - MDI-X mode
- * 3 - Auto for 1000Base-T only (MDI-X for 10/100Base-T modes)
- */
- phy_data &= ~M88E1000_PSCR_AUTO_X_MODE;
-
- switch (hw->mdix) {
- case 1:
- phy_data |= M88E1000_PSCR_MDI_MANUAL_MODE;
- break;
- case 2:
- phy_data |= M88E1000_PSCR_MDIX_MANUAL_MODE;
- break;
- case 3:
- phy_data |= M88E1000_PSCR_AUTO_X_1000T;
- break;
- case 0:
- default:
- phy_data |= M88E1000_PSCR_AUTO_X_MODE;
- break;
- }
-
- /* Options:
- * disable_polarity_correction = 0 (default)
- * Automatic Correction for Reversed Cable Polarity
- * 0 - Disabled
- * 1 - Enabled
- */
- phy_data &= ~M88E1000_PSCR_POLARITY_REVERSAL;
- if (hw->disable_polarity_correction == 1)
- phy_data |= M88E1000_PSCR_POLARITY_REVERSAL;
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- if (hw->phy_revision < M88E1011_I_REV_4) {
- /* Force TX_CLK in the Extended PHY Specific Control Register
- * to 25MHz clock.
- */
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_EXT_PHY_SPEC_CTRL,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= M88E1000_EPSCR_TX_CLK_25;
-
- if ((hw->phy_revision == E1000_REVISION_2) &&
- (hw->phy_id == M88E1111_I_PHY_ID)) {
- /* Vidalia Phy, set the downshift counter to 5x */
- phy_data &= ~(M88EC018_EPSCR_DOWNSHIFT_COUNTER_MASK);
- phy_data |= M88EC018_EPSCR_DOWNSHIFT_COUNTER_5X;
- ret_val = e1000_write_phy_reg(hw,
- M88E1000_EXT_PHY_SPEC_CTRL,
- phy_data);
- if (ret_val)
- return ret_val;
- } else {
- /* Configure Master and Slave downshift values */
- phy_data &= ~(M88E1000_EPSCR_MASTER_DOWNSHIFT_MASK |
- M88E1000_EPSCR_SLAVE_DOWNSHIFT_MASK);
- phy_data |= (M88E1000_EPSCR_MASTER_DOWNSHIFT_1X |
- M88E1000_EPSCR_SLAVE_DOWNSHIFT_1X);
- ret_val = e1000_write_phy_reg(hw,
- M88E1000_EXT_PHY_SPEC_CTRL,
- phy_data);
- if (ret_val)
- return ret_val;
- }
- }
-
- /* SW Reset the PHY so all changes take effect */
- ret_val = e1000_phy_reset(hw);
- if (ret_val) {
- e_dbg("Error Resetting the PHY\n");
- return ret_val;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_autoneg - setup auto-neg
- * @hw: Struct containing variables accessed by shared code
- *
- * Setup auto-negotiation and flow control advertisements,
- * and then perform auto-negotiation.
- */
-static s32 e1000_copper_link_autoneg(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_copper_link_autoneg");
-
- /* Perform some bounds checking on the hw->autoneg_advertised
- * parameter. If this variable is zero, then set it to the default.
- */
- hw->autoneg_advertised &= AUTONEG_ADVERTISE_SPEED_DEFAULT;
-
- /* If autoneg_advertised is zero, we assume it was not defaulted
- * by the calling code so we set to advertise full capability.
- */
- if (hw->autoneg_advertised == 0)
- hw->autoneg_advertised = AUTONEG_ADVERTISE_SPEED_DEFAULT;
-
- /* IFE/RTL8201N PHY only supports 10/100 */
- if (hw->phy_type == e1000_phy_8201)
- hw->autoneg_advertised &= AUTONEG_ADVERTISE_10_100_ALL;
-
- e_dbg("Reconfiguring auto-neg advertisement params\n");
- ret_val = e1000_phy_setup_autoneg(hw);
- if (ret_val) {
- e_dbg("Error Setting up Auto-Negotiation\n");
- return ret_val;
- }
- e_dbg("Restarting Auto-Neg\n");
-
- /* Restart auto-negotiation by setting the Auto Neg Enable bit and
- * the Auto Neg Restart bit in the PHY control register.
- */
- ret_val = e1000_read_phy_reg(hw, PHY_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= (MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG);
- ret_val = e1000_write_phy_reg(hw, PHY_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- /* Does the user want to wait for Auto-Neg to complete here, or
- * check at a later time (for example, callback routine).
- */
- if (hw->wait_autoneg_complete) {
- ret_val = e1000_wait_autoneg(hw);
- if (ret_val) {
- e_dbg
- ("Error while waiting for autoneg to complete\n");
- return ret_val;
- }
- }
-
- hw->get_link_status = true;
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_copper_link_postconfig - post link setup
- * @hw: Struct containing variables accessed by shared code
- *
- * Config the MAC and the PHY after link is up.
- * 1) Set up the MAC to the current PHY speed/duplex
- * if we are on 82543. If we
- * are on newer silicon, we only need to configure
- * collision distance in the Transmit Control Register.
- * 2) Set up flow control on the MAC to that established with
- * the link partner.
- * 3) Config DSP to improve Gigabit link quality for some PHY revisions.
- */
-static s32 e1000_copper_link_postconfig(struct e1000_hw *hw)
-{
- s32 ret_val;
- e_dbg("e1000_copper_link_postconfig");
-
- if ((hw->mac_type >= e1000_82544) && (hw->mac_type != e1000_ce4100)) {
- e1000_config_collision_dist(hw);
- } else {
- ret_val = e1000_config_mac_to_phy(hw);
- if (ret_val) {
- e_dbg("Error configuring MAC to PHY settings\n");
- return ret_val;
- }
- }
- ret_val = e1000_config_fc_after_link_up(hw);
- if (ret_val) {
- e_dbg("Error Configuring Flow Control\n");
- return ret_val;
- }
-
- /* Config DSP to improve Giga link quality */
- if (hw->phy_type == e1000_phy_igp) {
- ret_val = e1000_config_dsp_after_link_change(hw, true);
- if (ret_val) {
- e_dbg("Error Configuring DSP after link up\n");
- return ret_val;
- }
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_setup_copper_link - phy/speed/duplex setting
- * @hw: Struct containing variables accessed by shared code
- *
- * Detects which PHY is present and sets up the speed and duplex
- */
-static s32 e1000_setup_copper_link(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 i;
- u16 phy_data;
-
- e_dbg("e1000_setup_copper_link");
-
- /* Check if it is a valid PHY and set PHY mode if necessary. */
- ret_val = e1000_copper_link_preconfig(hw);
- if (ret_val)
- return ret_val;
-
- if (hw->phy_type == e1000_phy_igp) {
- ret_val = e1000_copper_link_igp_setup(hw);
- if (ret_val)
- return ret_val;
- } else if (hw->phy_type == e1000_phy_m88) {
- ret_val = e1000_copper_link_mgp_setup(hw);
- if (ret_val)
- return ret_val;
- } else {
- ret_val = gbe_dhg_phy_setup(hw);
- if (ret_val) {
- e_dbg("gbe_dhg_phy_setup failed!\n");
- return ret_val;
- }
- }
-
- if (hw->autoneg) {
- /* Setup autoneg and flow control advertisement
- * and perform autonegotiation */
- ret_val = e1000_copper_link_autoneg(hw);
- if (ret_val)
- return ret_val;
- } else {
- /* PHY will be set to 10H, 10F, 100H,or 100F
- * depending on value from forced_speed_duplex. */
- e_dbg("Forcing speed and duplex\n");
- ret_val = e1000_phy_force_speed_duplex(hw);
- if (ret_val) {
- e_dbg("Error Forcing Speed and Duplex\n");
- return ret_val;
- }
- }
-
- /* Check link status. Wait up to 100 microseconds for link to become
- * valid.
- */
- for (i = 0; i < 10; i++) {
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- if (phy_data & MII_SR_LINK_STATUS) {
- /* Config the MAC and PHY after link is up */
- ret_val = e1000_copper_link_postconfig(hw);
- if (ret_val)
- return ret_val;
-
- e_dbg("Valid link established!!!\n");
- return E1000_SUCCESS;
- }
- udelay(10);
- }
-
- e_dbg("Unable to establish link!!!\n");
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_setup_autoneg - phy settings
- * @hw: Struct containing variables accessed by shared code
- *
- * Configures PHY autoneg and flow control advertisement settings
- */
-s32 e1000_phy_setup_autoneg(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 mii_autoneg_adv_reg;
- u16 mii_1000t_ctrl_reg;
-
- e_dbg("e1000_phy_setup_autoneg");
-
- /* Read the MII Auto-Neg Advertisement Register (Address 4). */
- ret_val = e1000_read_phy_reg(hw, PHY_AUTONEG_ADV, &mii_autoneg_adv_reg);
- if (ret_val)
- return ret_val;
-
- /* Read the MII 1000Base-T Control Register (Address 9). */
- ret_val = e1000_read_phy_reg(hw, PHY_1000T_CTRL, &mii_1000t_ctrl_reg);
- if (ret_val)
- return ret_val;
- else if (hw->phy_type == e1000_phy_8201)
- mii_1000t_ctrl_reg &= ~REG9_SPEED_MASK;
-
- /* Need to parse both autoneg_advertised and fc and set up
- * the appropriate PHY registers. First we will parse for
- * autoneg_advertised software override. Since we can advertise
- * a plethora of combinations, we need to check each bit
- * individually.
- */
-
- /* First we clear all the 10/100 mb speed bits in the Auto-Neg
- * Advertisement Register (Address 4) and the 1000 mb speed bits in
- * the 1000Base-T Control Register (Address 9).
- */
- mii_autoneg_adv_reg &= ~REG4_SPEED_MASK;
- mii_1000t_ctrl_reg &= ~REG9_SPEED_MASK;
-
- e_dbg("autoneg_advertised %x\n", hw->autoneg_advertised);
-
- /* Do we want to advertise 10 Mb Half Duplex? */
- if (hw->autoneg_advertised & ADVERTISE_10_HALF) {
- e_dbg("Advertise 10mb Half duplex\n");
- mii_autoneg_adv_reg |= NWAY_AR_10T_HD_CAPS;
- }
-
- /* Do we want to advertise 10 Mb Full Duplex? */
- if (hw->autoneg_advertised & ADVERTISE_10_FULL) {
- e_dbg("Advertise 10mb Full duplex\n");
- mii_autoneg_adv_reg |= NWAY_AR_10T_FD_CAPS;
- }
-
- /* Do we want to advertise 100 Mb Half Duplex? */
- if (hw->autoneg_advertised & ADVERTISE_100_HALF) {
- e_dbg("Advertise 100mb Half duplex\n");
- mii_autoneg_adv_reg |= NWAY_AR_100TX_HD_CAPS;
- }
-
- /* Do we want to advertise 100 Mb Full Duplex? */
- if (hw->autoneg_advertised & ADVERTISE_100_FULL) {
- e_dbg("Advertise 100mb Full duplex\n");
- mii_autoneg_adv_reg |= NWAY_AR_100TX_FD_CAPS;
- }
-
- /* We do not allow the Phy to advertise 1000 Mb Half Duplex */
- if (hw->autoneg_advertised & ADVERTISE_1000_HALF) {
- e_dbg
- ("Advertise 1000mb Half duplex requested, request denied!\n");
- }
-
- /* Do we want to advertise 1000 Mb Full Duplex? */
- if (hw->autoneg_advertised & ADVERTISE_1000_FULL) {
- e_dbg("Advertise 1000mb Full duplex\n");
- mii_1000t_ctrl_reg |= CR_1000T_FD_CAPS;
- }
-
- /* Check for a software override of the flow control settings, and
- * setup the PHY advertisement registers accordingly. If
- * auto-negotiation is enabled, then software will have to set the
- * "PAUSE" bits to the correct value in the Auto-Negotiation
- * Advertisement Register (PHY_AUTONEG_ADV) and re-start auto-negotiation.
- *
- * The possible values of the "fc" parameter are:
- * 0: Flow control is completely disabled
- * 1: Rx flow control is enabled (we can receive pause frames
- * but not send pause frames).
- * 2: Tx flow control is enabled (we can send pause frames
- * but we do not support receiving pause frames).
- * 3: Both Rx and TX flow control (symmetric) are enabled.
- * other: No software override. The flow control configuration
- * in the EEPROM is used.
- */
- switch (hw->fc) {
- case E1000_FC_NONE: /* 0 */
- /* Flow control (RX & TX) is completely disabled by a
- * software over-ride.
- */
- mii_autoneg_adv_reg &= ~(NWAY_AR_ASM_DIR | NWAY_AR_PAUSE);
- break;
- case E1000_FC_RX_PAUSE: /* 1 */
- /* RX Flow control is enabled, and TX Flow control is
- * disabled, by a software over-ride.
- */
- /* Since there really isn't a way to advertise that we are
- * capable of RX Pause ONLY, we will advertise that we
- * support both symmetric and asymmetric RX PAUSE. Later
- * (in e1000_config_fc_after_link_up) we will disable the
- *hw's ability to send PAUSE frames.
- */
- mii_autoneg_adv_reg |= (NWAY_AR_ASM_DIR | NWAY_AR_PAUSE);
- break;
- case E1000_FC_TX_PAUSE: /* 2 */
- /* TX Flow control is enabled, and RX Flow control is
- * disabled, by a software over-ride.
- */
- mii_autoneg_adv_reg |= NWAY_AR_ASM_DIR;
- mii_autoneg_adv_reg &= ~NWAY_AR_PAUSE;
- break;
- case E1000_FC_FULL: /* 3 */
- /* Flow control (both RX and TX) is enabled by a software
- * over-ride.
- */
- mii_autoneg_adv_reg |= (NWAY_AR_ASM_DIR | NWAY_AR_PAUSE);
- break;
- default:
- e_dbg("Flow control param set incorrectly\n");
- return -E1000_ERR_CONFIG;
- }
-
- ret_val = e1000_write_phy_reg(hw, PHY_AUTONEG_ADV, mii_autoneg_adv_reg);
- if (ret_val)
- return ret_val;
-
- e_dbg("Auto-Neg Advertising %x\n", mii_autoneg_adv_reg);
-
- if (hw->phy_type == e1000_phy_8201) {
- mii_1000t_ctrl_reg = 0;
- } else {
- ret_val = e1000_write_phy_reg(hw, PHY_1000T_CTRL,
- mii_1000t_ctrl_reg);
- if (ret_val)
- return ret_val;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_force_speed_duplex - force link settings
- * @hw: Struct containing variables accessed by shared code
- *
- * Force PHY speed and duplex settings to hw->forced_speed_duplex
- */
-static s32 e1000_phy_force_speed_duplex(struct e1000_hw *hw)
-{
- u32 ctrl;
- s32 ret_val;
- u16 mii_ctrl_reg;
- u16 mii_status_reg;
- u16 phy_data;
- u16 i;
-
- e_dbg("e1000_phy_force_speed_duplex");
-
- /* Turn off Flow control if we are forcing speed and duplex. */
- hw->fc = E1000_FC_NONE;
-
- e_dbg("hw->fc = %d\n", hw->fc);
-
- /* Read the Device Control Register. */
- ctrl = er32(CTRL);
-
- /* Set the bits to Force Speed and Duplex in the Device Ctrl Reg. */
- ctrl |= (E1000_CTRL_FRCSPD | E1000_CTRL_FRCDPX);
- ctrl &= ~(DEVICE_SPEED_MASK);
-
- /* Clear the Auto Speed Detect Enable bit. */
- ctrl &= ~E1000_CTRL_ASDE;
-
- /* Read the MII Control Register. */
- ret_val = e1000_read_phy_reg(hw, PHY_CTRL, &mii_ctrl_reg);
- if (ret_val)
- return ret_val;
-
- /* We need to disable autoneg in order to force link and duplex. */
-
- mii_ctrl_reg &= ~MII_CR_AUTO_NEG_EN;
-
- /* Are we forcing Full or Half Duplex? */
- if (hw->forced_speed_duplex == e1000_100_full ||
- hw->forced_speed_duplex == e1000_10_full) {
- /* We want to force full duplex so we SET the full duplex bits in the
- * Device and MII Control Registers.
- */
- ctrl |= E1000_CTRL_FD;
- mii_ctrl_reg |= MII_CR_FULL_DUPLEX;
- e_dbg("Full Duplex\n");
- } else {
- /* We want to force half duplex so we CLEAR the full duplex bits in
- * the Device and MII Control Registers.
- */
- ctrl &= ~E1000_CTRL_FD;
- mii_ctrl_reg &= ~MII_CR_FULL_DUPLEX;
- e_dbg("Half Duplex\n");
- }
-
- /* Are we forcing 100Mbps??? */
- if (hw->forced_speed_duplex == e1000_100_full ||
- hw->forced_speed_duplex == e1000_100_half) {
- /* Set the 100Mb bit and turn off the 1000Mb and 10Mb bits. */
- ctrl |= E1000_CTRL_SPD_100;
- mii_ctrl_reg |= MII_CR_SPEED_100;
- mii_ctrl_reg &= ~(MII_CR_SPEED_1000 | MII_CR_SPEED_10);
- e_dbg("Forcing 100mb ");
- } else {
- /* Set the 10Mb bit and turn off the 1000Mb and 100Mb bits. */
- ctrl &= ~(E1000_CTRL_SPD_1000 | E1000_CTRL_SPD_100);
- mii_ctrl_reg |= MII_CR_SPEED_10;
- mii_ctrl_reg &= ~(MII_CR_SPEED_1000 | MII_CR_SPEED_100);
- e_dbg("Forcing 10mb ");
- }
-
- e1000_config_collision_dist(hw);
-
- /* Write the configured values back to the Device Control Reg. */
- ew32(CTRL, ctrl);
-
- if (hw->phy_type == e1000_phy_m88) {
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- /* Clear Auto-Crossover to force MDI manually. M88E1000 requires MDI
- * forced whenever speed are duplex are forced.
- */
- phy_data &= ~M88E1000_PSCR_AUTO_X_MODE;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- e_dbg("M88E1000 PSCR: %x\n", phy_data);
-
- /* Need to reset the PHY or these changes will be ignored */
- mii_ctrl_reg |= MII_CR_RESET;
-
- /* Disable MDI-X support for 10/100 */
- } else {
- /* Clear Auto-Crossover to force MDI manually. IGP requires MDI
- * forced whenever speed or duplex are forced.
- */
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &= ~IGP01E1000_PSCR_AUTO_MDIX;
- phy_data &= ~IGP01E1000_PSCR_FORCE_MDI_MDIX;
-
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CTRL, phy_data);
- if (ret_val)
- return ret_val;
- }
-
- /* Write back the modified PHY MII control register. */
- ret_val = e1000_write_phy_reg(hw, PHY_CTRL, mii_ctrl_reg);
- if (ret_val)
- return ret_val;
-
- udelay(1);
-
- /* The wait_autoneg_complete flag may be a little misleading here.
- * Since we are forcing speed and duplex, Auto-Neg is not enabled.
- * But we do want to delay for a period while forcing only so we
- * don't generate false No Link messages. So we will wait here
- * only if the user has set wait_autoneg_complete to 1, which is
- * the default.
- */
- if (hw->wait_autoneg_complete) {
- /* We will wait for autoneg to complete. */
- e_dbg("Waiting for forced speed/duplex link.\n");
- mii_status_reg = 0;
-
- /* We will wait for autoneg to complete or 4.5 seconds to expire. */
- for (i = PHY_FORCE_TIME; i > 0; i--) {
- /* Read the MII Status Register and wait for Auto-Neg Complete bit
- * to be set.
- */
- ret_val =
- e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- ret_val =
- e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- if (mii_status_reg & MII_SR_LINK_STATUS)
- break;
- msleep(100);
- }
- if ((i == 0) && (hw->phy_type == e1000_phy_m88)) {
- /* We didn't get link. Reset the DSP and wait again for link. */
- ret_val = e1000_phy_reset_dsp(hw);
- if (ret_val) {
- e_dbg("Error Resetting PHY DSP\n");
- return ret_val;
- }
- }
- /* This loop will early-out if the link condition has been met. */
- for (i = PHY_FORCE_TIME; i > 0; i--) {
- if (mii_status_reg & MII_SR_LINK_STATUS)
- break;
- msleep(100);
- /* Read the MII Status Register and wait for Auto-Neg Complete bit
- * to be set.
- */
- ret_val =
- e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- ret_val =
- e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
- }
- }
-
- if (hw->phy_type == e1000_phy_m88) {
- /* Because we reset the PHY above, we need to re-force TX_CLK in the
- * Extended PHY Specific Control Register to 25MHz clock. This value
- * defaults back to a 2.5MHz clock when the PHY is reset.
- */
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_EXT_PHY_SPEC_CTRL,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= M88E1000_EPSCR_TX_CLK_25;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_EXT_PHY_SPEC_CTRL,
- phy_data);
- if (ret_val)
- return ret_val;
-
- /* In addition, because of the s/w reset above, we need to enable CRS on
- * TX. This must be set for both full and half duplex operation.
- */
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= M88E1000_PSCR_ASSERT_CRS_ON_TX;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- if ((hw->mac_type == e1000_82544 || hw->mac_type == e1000_82543)
- && (!hw->autoneg)
- && (hw->forced_speed_duplex == e1000_10_full
- || hw->forced_speed_duplex == e1000_10_half)) {
- ret_val = e1000_polarity_reversal_workaround(hw);
- if (ret_val)
- return ret_val;
- }
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_config_collision_dist - set collision distance register
- * @hw: Struct containing variables accessed by shared code
- *
- * Sets the collision distance in the Transmit Control register.
- * Link should have been established previously. Reads the speed and duplex
- * information from the Device Status register.
- */
-void e1000_config_collision_dist(struct e1000_hw *hw)
-{
- u32 tctl, coll_dist;
-
- e_dbg("e1000_config_collision_dist");
-
- if (hw->mac_type < e1000_82543)
- coll_dist = E1000_COLLISION_DISTANCE_82542;
- else
- coll_dist = E1000_COLLISION_DISTANCE;
-
- tctl = er32(TCTL);
-
- tctl &= ~E1000_TCTL_COLD;
- tctl |= coll_dist << E1000_COLD_SHIFT;
-
- ew32(TCTL, tctl);
- E1000_WRITE_FLUSH();
-}
-
-/**
- * e1000_config_mac_to_phy - sync phy and mac settings
- * @hw: Struct containing variables accessed by shared code
- * @mii_reg: data to write to the MII control register
- *
- * Sets MAC speed and duplex settings to reflect the those in the PHY
- * The contents of the PHY register containing the needed information need to
- * be passed in.
- */
-static s32 e1000_config_mac_to_phy(struct e1000_hw *hw)
-{
- u32 ctrl;
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_config_mac_to_phy");
-
- /* 82544 or newer MAC, Auto Speed Detection takes care of
- * MAC speed/duplex configuration.*/
- if ((hw->mac_type >= e1000_82544) && (hw->mac_type != e1000_ce4100))
- return E1000_SUCCESS;
-
- /* Read the Device Control Register and set the bits to Force Speed
- * and Duplex.
- */
- ctrl = er32(CTRL);
- ctrl |= (E1000_CTRL_FRCSPD | E1000_CTRL_FRCDPX);
- ctrl &= ~(E1000_CTRL_SPD_SEL | E1000_CTRL_ILOS);
-
- switch (hw->phy_type) {
- case e1000_phy_8201:
- ret_val = e1000_read_phy_reg(hw, PHY_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- if (phy_data & RTL_PHY_CTRL_FD)
- ctrl |= E1000_CTRL_FD;
- else
- ctrl &= ~E1000_CTRL_FD;
-
- if (phy_data & RTL_PHY_CTRL_SPD_100)
- ctrl |= E1000_CTRL_SPD_100;
- else
- ctrl |= E1000_CTRL_SPD_10;
-
- e1000_config_collision_dist(hw);
- break;
- default:
- /* Set up duplex in the Device Control and Transmit Control
- * registers depending on negotiated values.
- */
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- if (phy_data & M88E1000_PSSR_DPLX)
- ctrl |= E1000_CTRL_FD;
- else
- ctrl &= ~E1000_CTRL_FD;
-
- e1000_config_collision_dist(hw);
-
- /* Set up speed in the Device Control register depending on
- * negotiated values.
- */
- if ((phy_data & M88E1000_PSSR_SPEED) == M88E1000_PSSR_1000MBS)
- ctrl |= E1000_CTRL_SPD_1000;
- else if ((phy_data & M88E1000_PSSR_SPEED) ==
- M88E1000_PSSR_100MBS)
- ctrl |= E1000_CTRL_SPD_100;
- }
-
- /* Write the configured values back to the Device Control Reg. */
- ew32(CTRL, ctrl);
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_force_mac_fc - force flow control settings
- * @hw: Struct containing variables accessed by shared code
- *
- * Forces the MAC's flow control settings.
- * Sets the TFCE and RFCE bits in the device control register to reflect
- * the adapter settings. TFCE and RFCE need to be explicitly set by
- * software when a Copper PHY is used because autonegotiation is managed
- * by the PHY rather than the MAC. Software must also configure these
- * bits when link is forced on a fiber connection.
- */
-s32 e1000_force_mac_fc(struct e1000_hw *hw)
-{
- u32 ctrl;
-
- e_dbg("e1000_force_mac_fc");
-
- /* Get the current configuration of the Device Control Register */
- ctrl = er32(CTRL);
-
- /* Because we didn't get link via the internal auto-negotiation
- * mechanism (we either forced link or we got link via PHY
- * auto-neg), we have to manually enable/disable transmit an
- * receive flow control.
- *
- * The "Case" statement below enables/disable flow control
- * according to the "hw->fc" parameter.
- *
- * The possible values of the "fc" parameter are:
- * 0: Flow control is completely disabled
- * 1: Rx flow control is enabled (we can receive pause
- * frames but not send pause frames).
- * 2: Tx flow control is enabled (we can send pause frames
- * frames but we do not receive pause frames).
- * 3: Both Rx and TX flow control (symmetric) is enabled.
- * other: No other values should be possible at this point.
- */
-
- switch (hw->fc) {
- case E1000_FC_NONE:
- ctrl &= (~(E1000_CTRL_TFCE | E1000_CTRL_RFCE));
- break;
- case E1000_FC_RX_PAUSE:
- ctrl &= (~E1000_CTRL_TFCE);
- ctrl |= E1000_CTRL_RFCE;
- break;
- case E1000_FC_TX_PAUSE:
- ctrl &= (~E1000_CTRL_RFCE);
- ctrl |= E1000_CTRL_TFCE;
- break;
- case E1000_FC_FULL:
- ctrl |= (E1000_CTRL_TFCE | E1000_CTRL_RFCE);
- break;
- default:
- e_dbg("Flow control param set incorrectly\n");
- return -E1000_ERR_CONFIG;
- }
-
- /* Disable TX Flow Control for 82542 (rev 2.0) */
- if (hw->mac_type == e1000_82542_rev2_0)
- ctrl &= (~E1000_CTRL_TFCE);
-
- ew32(CTRL, ctrl);
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_config_fc_after_link_up - configure flow control after autoneg
- * @hw: Struct containing variables accessed by shared code
- *
- * Configures flow control settings after link is established
- * Should be called immediately after a valid link has been established.
- * Forces MAC flow control settings if link was forced. When in MII/GMII mode
- * and autonegotiation is enabled, the MAC flow control settings will be set
- * based on the flow control negotiated by the PHY. In TBI mode, the TFCE
- * and RFCE bits will be automatically set to the negotiated flow control mode.
- */
-static s32 e1000_config_fc_after_link_up(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 mii_status_reg;
- u16 mii_nway_adv_reg;
- u16 mii_nway_lp_ability_reg;
- u16 speed;
- u16 duplex;
-
- e_dbg("e1000_config_fc_after_link_up");
-
- /* Check for the case where we have fiber media and auto-neg failed
- * so we had to force link. In this case, we need to force the
- * configuration of the MAC to match the "fc" parameter.
- */
- if (((hw->media_type == e1000_media_type_fiber) && (hw->autoneg_failed))
- || ((hw->media_type == e1000_media_type_internal_serdes)
- && (hw->autoneg_failed))
- || ((hw->media_type == e1000_media_type_copper)
- && (!hw->autoneg))) {
- ret_val = e1000_force_mac_fc(hw);
- if (ret_val) {
- e_dbg("Error forcing flow control settings\n");
- return ret_val;
- }
- }
-
- /* Check for the case where we have copper media and auto-neg is
- * enabled. In this case, we need to check and see if Auto-Neg
- * has completed, and if so, how the PHY and link partner has
- * flow control configured.
- */
- if ((hw->media_type == e1000_media_type_copper) && hw->autoneg) {
- /* Read the MII Status Register and check to see if AutoNeg
- * has completed. We read this twice because this reg has
- * some "sticky" (latched) bits.
- */
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- if (mii_status_reg & MII_SR_AUTONEG_COMPLETE) {
- /* The AutoNeg process has completed, so we now need to
- * read both the Auto Negotiation Advertisement Register
- * (Address 4) and the Auto_Negotiation Base Page Ability
- * Register (Address 5) to determine how flow control was
- * negotiated.
- */
- ret_val = e1000_read_phy_reg(hw, PHY_AUTONEG_ADV,
- &mii_nway_adv_reg);
- if (ret_val)
- return ret_val;
- ret_val = e1000_read_phy_reg(hw, PHY_LP_ABILITY,
- &mii_nway_lp_ability_reg);
- if (ret_val)
- return ret_val;
-
- /* Two bits in the Auto Negotiation Advertisement Register
- * (Address 4) and two bits in the Auto Negotiation Base
- * Page Ability Register (Address 5) determine flow control
- * for both the PHY and the link partner. The following
- * table, taken out of the IEEE 802.3ab/D6.0 dated March 25,
- * 1999, describes these PAUSE resolution bits and how flow
- * control is determined based upon these settings.
- * NOTE: DC = Don't Care
- *
- * LOCAL DEVICE | LINK PARTNER
- * PAUSE | ASM_DIR | PAUSE | ASM_DIR | NIC Resolution
- *-------|---------|-------|---------|--------------------
- * 0 | 0 | DC | DC | E1000_FC_NONE
- * 0 | 1 | 0 | DC | E1000_FC_NONE
- * 0 | 1 | 1 | 0 | E1000_FC_NONE
- * 0 | 1 | 1 | 1 | E1000_FC_TX_PAUSE
- * 1 | 0 | 0 | DC | E1000_FC_NONE
- * 1 | DC | 1 | DC | E1000_FC_FULL
- * 1 | 1 | 0 | 0 | E1000_FC_NONE
- * 1 | 1 | 0 | 1 | E1000_FC_RX_PAUSE
- *
- */
- /* Are both PAUSE bits set to 1? If so, this implies
- * Symmetric Flow Control is enabled at both ends. The
- * ASM_DIR bits are irrelevant per the spec.
- *
- * For Symmetric Flow Control:
- *
- * LOCAL DEVICE | LINK PARTNER
- * PAUSE | ASM_DIR | PAUSE | ASM_DIR | Result
- *-------|---------|-------|---------|--------------------
- * 1 | DC | 1 | DC | E1000_FC_FULL
- *
- */
- if ((mii_nway_adv_reg & NWAY_AR_PAUSE) &&
- (mii_nway_lp_ability_reg & NWAY_LPAR_PAUSE)) {
- /* Now we need to check if the user selected RX ONLY
- * of pause frames. In this case, we had to advertise
- * FULL flow control because we could not advertise RX
- * ONLY. Hence, we must now check to see if we need to
- * turn OFF the TRANSMISSION of PAUSE frames.
- */
- if (hw->original_fc == E1000_FC_FULL) {
- hw->fc = E1000_FC_FULL;
- e_dbg("Flow Control = FULL.\n");
- } else {
- hw->fc = E1000_FC_RX_PAUSE;
- e_dbg
- ("Flow Control = RX PAUSE frames only.\n");
- }
- }
- /* For receiving PAUSE frames ONLY.
- *
- * LOCAL DEVICE | LINK PARTNER
- * PAUSE | ASM_DIR | PAUSE | ASM_DIR | Result
- *-------|---------|-------|---------|--------------------
- * 0 | 1 | 1 | 1 | E1000_FC_TX_PAUSE
- *
- */
- else if (!(mii_nway_adv_reg & NWAY_AR_PAUSE) &&
- (mii_nway_adv_reg & NWAY_AR_ASM_DIR) &&
- (mii_nway_lp_ability_reg & NWAY_LPAR_PAUSE) &&
- (mii_nway_lp_ability_reg & NWAY_LPAR_ASM_DIR))
- {
- hw->fc = E1000_FC_TX_PAUSE;
- e_dbg
- ("Flow Control = TX PAUSE frames only.\n");
- }
- /* For transmitting PAUSE frames ONLY.
- *
- * LOCAL DEVICE | LINK PARTNER
- * PAUSE | ASM_DIR | PAUSE | ASM_DIR | Result
- *-------|---------|-------|---------|--------------------
- * 1 | 1 | 0 | 1 | E1000_FC_RX_PAUSE
- *
- */
- else if ((mii_nway_adv_reg & NWAY_AR_PAUSE) &&
- (mii_nway_adv_reg & NWAY_AR_ASM_DIR) &&
- !(mii_nway_lp_ability_reg & NWAY_LPAR_PAUSE) &&
- (mii_nway_lp_ability_reg & NWAY_LPAR_ASM_DIR))
- {
- hw->fc = E1000_FC_RX_PAUSE;
- e_dbg
- ("Flow Control = RX PAUSE frames only.\n");
- }
- /* Per the IEEE spec, at this point flow control should be
- * disabled. However, we want to consider that we could
- * be connected to a legacy switch that doesn't advertise
- * desired flow control, but can be forced on the link
- * partner. So if we advertised no flow control, that is
- * what we will resolve to. If we advertised some kind of
- * receive capability (Rx Pause Only or Full Flow Control)
- * and the link partner advertised none, we will configure
- * ourselves to enable Rx Flow Control only. We can do
- * this safely for two reasons: If the link partner really
- * didn't want flow control enabled, and we enable Rx, no
- * harm done since we won't be receiving any PAUSE frames
- * anyway. If the intent on the link partner was to have
- * flow control enabled, then by us enabling RX only, we
- * can at least receive pause frames and process them.
- * This is a good idea because in most cases, since we are
- * predominantly a server NIC, more times than not we will
- * be asked to delay transmission of packets than asking
- * our link partner to pause transmission of frames.
- */
- else if ((hw->original_fc == E1000_FC_NONE ||
- hw->original_fc == E1000_FC_TX_PAUSE) ||
- hw->fc_strict_ieee) {
- hw->fc = E1000_FC_NONE;
- e_dbg("Flow Control = NONE.\n");
- } else {
- hw->fc = E1000_FC_RX_PAUSE;
- e_dbg
- ("Flow Control = RX PAUSE frames only.\n");
- }
-
- /* Now we need to do one last check... If we auto-
- * negotiated to HALF DUPLEX, flow control should not be
- * enabled per IEEE 802.3 spec.
- */
- ret_val =
- e1000_get_speed_and_duplex(hw, &speed, &duplex);
- if (ret_val) {
- e_dbg
- ("Error getting link speed and duplex\n");
- return ret_val;
- }
-
- if (duplex == HALF_DUPLEX)
- hw->fc = E1000_FC_NONE;
-
- /* Now we call a subroutine to actually force the MAC
- * controller to use the correct flow control settings.
- */
- ret_val = e1000_force_mac_fc(hw);
- if (ret_val) {
- e_dbg
- ("Error forcing flow control settings\n");
- return ret_val;
- }
- } else {
- e_dbg
- ("Copper PHY and Auto Neg has not completed.\n");
- }
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_check_for_serdes_link_generic - Check for link (Serdes)
- * @hw: pointer to the HW structure
- *
- * Checks for link up on the hardware. If link is not up and we have
- * a signal, then we need to force link up.
- */
-static s32 e1000_check_for_serdes_link_generic(struct e1000_hw *hw)
-{
- u32 rxcw;
- u32 ctrl;
- u32 status;
- s32 ret_val = E1000_SUCCESS;
-
- e_dbg("e1000_check_for_serdes_link_generic");
-
- ctrl = er32(CTRL);
- status = er32(STATUS);
- rxcw = er32(RXCW);
-
- /*
- * If we don't have link (auto-negotiation failed or link partner
- * cannot auto-negotiate), and our link partner is not trying to
- * auto-negotiate with us (we are receiving idles or data),
- * we need to force link up. We also need to give auto-negotiation
- * time to complete.
- */
- /* (ctrl & E1000_CTRL_SWDPIN1) == 1 == have signal */
- if ((!(status & E1000_STATUS_LU)) && (!(rxcw & E1000_RXCW_C))) {
- if (hw->autoneg_failed == 0) {
- hw->autoneg_failed = 1;
- goto out;
- }
- e_dbg("NOT RXing /C/, disable AutoNeg and force link.\n");
-
- /* Disable auto-negotiation in the TXCW register */
- ew32(TXCW, (hw->txcw & ~E1000_TXCW_ANE));
-
- /* Force link-up and also force full-duplex. */
- ctrl = er32(CTRL);
- ctrl |= (E1000_CTRL_SLU | E1000_CTRL_FD);
- ew32(CTRL, ctrl);
-
- /* Configure Flow Control after forcing link up. */
- ret_val = e1000_config_fc_after_link_up(hw);
- if (ret_val) {
- e_dbg("Error configuring flow control\n");
- goto out;
- }
- } else if ((ctrl & E1000_CTRL_SLU) && (rxcw & E1000_RXCW_C)) {
- /*
- * If we are forcing link and we are receiving /C/ ordered
- * sets, re-enable auto-negotiation in the TXCW register
- * and disable forced link in the Device Control register
- * in an attempt to auto-negotiate with our link partner.
- */
- e_dbg("RXing /C/, enable AutoNeg and stop forcing link.\n");
- ew32(TXCW, hw->txcw);
- ew32(CTRL, (ctrl & ~E1000_CTRL_SLU));
-
- hw->serdes_has_link = true;
- } else if (!(E1000_TXCW_ANE & er32(TXCW))) {
- /*
- * If we force link for non-auto-negotiation switch, check
- * link status based on MAC synchronization for internal
- * serdes media type.
- */
- /* SYNCH bit and IV bit are sticky. */
- udelay(10);
- rxcw = er32(RXCW);
- if (rxcw & E1000_RXCW_SYNCH) {
- if (!(rxcw & E1000_RXCW_IV)) {
- hw->serdes_has_link = true;
- e_dbg("SERDES: Link up - forced.\n");
- }
- } else {
- hw->serdes_has_link = false;
- e_dbg("SERDES: Link down - force failed.\n");
- }
- }
-
- if (E1000_TXCW_ANE & er32(TXCW)) {
- status = er32(STATUS);
- if (status & E1000_STATUS_LU) {
- /* SYNCH bit and IV bit are sticky, so reread rxcw. */
- udelay(10);
- rxcw = er32(RXCW);
- if (rxcw & E1000_RXCW_SYNCH) {
- if (!(rxcw & E1000_RXCW_IV)) {
- hw->serdes_has_link = true;
- e_dbg("SERDES: Link up - autoneg "
- "completed successfully.\n");
- } else {
- hw->serdes_has_link = false;
- e_dbg("SERDES: Link down - invalid"
- "codewords detected in autoneg.\n");
- }
- } else {
- hw->serdes_has_link = false;
- e_dbg("SERDES: Link down - no sync.\n");
- }
- } else {
- hw->serdes_has_link = false;
- e_dbg("SERDES: Link down - autoneg failed\n");
- }
- }
-
- out:
- return ret_val;
-}
-
-/**
- * e1000_check_for_link
- * @hw: Struct containing variables accessed by shared code
- *
- * Checks to see if the link status of the hardware has changed.
- * Called by any function that needs to check the link status of the adapter.
- */
-s32 e1000_check_for_link(struct e1000_hw *hw)
-{
- u32 rxcw = 0;
- u32 ctrl;
- u32 status;
- u32 rctl;
- u32 icr;
- u32 signal = 0;
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_check_for_link");
-
- ctrl = er32(CTRL);
- status = er32(STATUS);
-
- /* On adapters with a MAC newer than 82544, SW Definable pin 1 will be
- * set when the optics detect a signal. On older adapters, it will be
- * cleared when there is a signal. This applies to fiber media only.
- */
- if ((hw->media_type == e1000_media_type_fiber) ||
- (hw->media_type == e1000_media_type_internal_serdes)) {
- rxcw = er32(RXCW);
-
- if (hw->media_type == e1000_media_type_fiber) {
- signal =
- (hw->mac_type >
- e1000_82544) ? E1000_CTRL_SWDPIN1 : 0;
- if (status & E1000_STATUS_LU)
- hw->get_link_status = false;
- }
- }
-
- /* If we have a copper PHY then we only want to go out to the PHY
- * registers to see if Auto-Neg has completed and/or if our link
- * status has changed. The get_link_status flag will be set if we
- * receive a Link Status Change interrupt or we have Rx Sequence
- * Errors.
- */
- if ((hw->media_type == e1000_media_type_copper) && hw->get_link_status) {
- /* First we want to see if the MII Status Register reports
- * link. If so, then we want to get the current speed/duplex
- * of the PHY.
- * Read the register twice since the link bit is sticky.
- */
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- if (phy_data & MII_SR_LINK_STATUS) {
- hw->get_link_status = false;
- /* Check if there was DownShift, must be checked immediately after
- * link-up */
- e1000_check_downshift(hw);
-
- /* If we are on 82544 or 82543 silicon and speed/duplex
- * are forced to 10H or 10F, then we will implement the polarity
- * reversal workaround. We disable interrupts first, and upon
- * returning, place the devices interrupt state to its previous
- * value except for the link status change interrupt which will
- * happen due to the execution of this workaround.
- */
-
- if ((hw->mac_type == e1000_82544
- || hw->mac_type == e1000_82543) && (!hw->autoneg)
- && (hw->forced_speed_duplex == e1000_10_full
- || hw->forced_speed_duplex == e1000_10_half)) {
- ew32(IMC, 0xffffffff);
- ret_val =
- e1000_polarity_reversal_workaround(hw);
- icr = er32(ICR);
- ew32(ICS, (icr & ~E1000_ICS_LSC));
- ew32(IMS, IMS_ENABLE_MASK);
- }
-
- } else {
- /* No link detected */
- e1000_config_dsp_after_link_change(hw, false);
- return 0;
- }
-
- /* If we are forcing speed/duplex, then we simply return since
- * we have already determined whether we have link or not.
- */
- if (!hw->autoneg)
- return -E1000_ERR_CONFIG;
-
- /* optimize the dsp settings for the igp phy */
- e1000_config_dsp_after_link_change(hw, true);
-
- /* We have a M88E1000 PHY and Auto-Neg is enabled. If we
- * have Si on board that is 82544 or newer, Auto
- * Speed Detection takes care of MAC speed/duplex
- * configuration. So we only need to configure Collision
- * Distance in the MAC. Otherwise, we need to force
- * speed/duplex on the MAC to the current PHY speed/duplex
- * settings.
- */
- if ((hw->mac_type >= e1000_82544) &&
- (hw->mac_type != e1000_ce4100))
- e1000_config_collision_dist(hw);
- else {
- ret_val = e1000_config_mac_to_phy(hw);
- if (ret_val) {
- e_dbg
- ("Error configuring MAC to PHY settings\n");
- return ret_val;
- }
- }
-
- /* Configure Flow Control now that Auto-Neg has completed. First, we
- * need to restore the desired flow control settings because we may
- * have had to re-autoneg with a different link partner.
- */
- ret_val = e1000_config_fc_after_link_up(hw);
- if (ret_val) {
- e_dbg("Error configuring flow control\n");
- return ret_val;
- }
-
- /* At this point we know that we are on copper and we have
- * auto-negotiated link. These are conditions for checking the link
- * partner capability register. We use the link speed to determine if
- * TBI compatibility needs to be turned on or off. If the link is not
- * at gigabit speed, then TBI compatibility is not needed. If we are
- * at gigabit speed, we turn on TBI compatibility.
- */
- if (hw->tbi_compatibility_en) {
- u16 speed, duplex;
- ret_val =
- e1000_get_speed_and_duplex(hw, &speed, &duplex);
- if (ret_val) {
- e_dbg
- ("Error getting link speed and duplex\n");
- return ret_val;
- }
- if (speed != SPEED_1000) {
- /* If link speed is not set to gigabit speed, we do not need
- * to enable TBI compatibility.
- */
- if (hw->tbi_compatibility_on) {
- /* If we previously were in the mode, turn it off. */
- rctl = er32(RCTL);
- rctl &= ~E1000_RCTL_SBP;
- ew32(RCTL, rctl);
- hw->tbi_compatibility_on = false;
- }
- } else {
- /* If TBI compatibility is was previously off, turn it on. For
- * compatibility with a TBI link partner, we will store bad
- * packets. Some frames have an additional byte on the end and
- * will look like CRC errors to to the hardware.
- */
- if (!hw->tbi_compatibility_on) {
- hw->tbi_compatibility_on = true;
- rctl = er32(RCTL);
- rctl |= E1000_RCTL_SBP;
- ew32(RCTL, rctl);
- }
- }
- }
- }
-
- if ((hw->media_type == e1000_media_type_fiber) ||
- (hw->media_type == e1000_media_type_internal_serdes))
- e1000_check_for_serdes_link_generic(hw);
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_get_speed_and_duplex
- * @hw: Struct containing variables accessed by shared code
- * @speed: Speed of the connection
- * @duplex: Duplex setting of the connection
-
- * Detects the current speed and duplex settings of the hardware.
- */
-s32 e1000_get_speed_and_duplex(struct e1000_hw *hw, u16 *speed, u16 *duplex)
-{
- u32 status;
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_get_speed_and_duplex");
-
- if (hw->mac_type >= e1000_82543) {
- status = er32(STATUS);
- if (status & E1000_STATUS_SPEED_1000) {
- *speed = SPEED_1000;
- e_dbg("1000 Mbs, ");
- } else if (status & E1000_STATUS_SPEED_100) {
- *speed = SPEED_100;
- e_dbg("100 Mbs, ");
- } else {
- *speed = SPEED_10;
- e_dbg("10 Mbs, ");
- }
-
- if (status & E1000_STATUS_FD) {
- *duplex = FULL_DUPLEX;
- e_dbg("Full Duplex\n");
- } else {
- *duplex = HALF_DUPLEX;
- e_dbg(" Half Duplex\n");
- }
- } else {
- e_dbg("1000 Mbs, Full Duplex\n");
- *speed = SPEED_1000;
- *duplex = FULL_DUPLEX;
- }
-
- /* IGP01 PHY may advertise full duplex operation after speed downgrade even
- * if it is operating at half duplex. Here we set the duplex settings to
- * match the duplex in the link partner's capabilities.
- */
- if (hw->phy_type == e1000_phy_igp && hw->speed_downgraded) {
- ret_val = e1000_read_phy_reg(hw, PHY_AUTONEG_EXP, &phy_data);
- if (ret_val)
- return ret_val;
-
- if (!(phy_data & NWAY_ER_LP_NWAY_CAPS))
- *duplex = HALF_DUPLEX;
- else {
- ret_val =
- e1000_read_phy_reg(hw, PHY_LP_ABILITY, &phy_data);
- if (ret_val)
- return ret_val;
- if ((*speed == SPEED_100
- && !(phy_data & NWAY_LPAR_100TX_FD_CAPS))
- || (*speed == SPEED_10
- && !(phy_data & NWAY_LPAR_10T_FD_CAPS)))
- *duplex = HALF_DUPLEX;
- }
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_wait_autoneg
- * @hw: Struct containing variables accessed by shared code
- *
- * Blocks until autoneg completes or times out (~4.5 seconds)
- */
-static s32 e1000_wait_autoneg(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 i;
- u16 phy_data;
-
- e_dbg("e1000_wait_autoneg");
- e_dbg("Waiting for Auto-Neg to complete.\n");
-
- /* We will wait for autoneg to complete or 4.5 seconds to expire. */
- for (i = PHY_AUTO_NEG_TIME; i > 0; i--) {
- /* Read the MII Status Register and wait for Auto-Neg
- * Complete bit to be set.
- */
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
- if (phy_data & MII_SR_AUTONEG_COMPLETE) {
- return E1000_SUCCESS;
- }
- msleep(100);
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_raise_mdi_clk - Raises the Management Data Clock
- * @hw: Struct containing variables accessed by shared code
- * @ctrl: Device control register's current value
- */
-static void e1000_raise_mdi_clk(struct e1000_hw *hw, u32 *ctrl)
-{
- /* Raise the clock input to the Management Data Clock (by setting the MDC
- * bit), and then delay 10 microseconds.
- */
- ew32(CTRL, (*ctrl | E1000_CTRL_MDC));
- E1000_WRITE_FLUSH();
- udelay(10);
-}
-
-/**
- * e1000_lower_mdi_clk - Lowers the Management Data Clock
- * @hw: Struct containing variables accessed by shared code
- * @ctrl: Device control register's current value
- */
-static void e1000_lower_mdi_clk(struct e1000_hw *hw, u32 *ctrl)
-{
- /* Lower the clock input to the Management Data Clock (by clearing the MDC
- * bit), and then delay 10 microseconds.
- */
- ew32(CTRL, (*ctrl & ~E1000_CTRL_MDC));
- E1000_WRITE_FLUSH();
- udelay(10);
-}
-
-/**
- * e1000_shift_out_mdi_bits - Shifts data bits out to the PHY
- * @hw: Struct containing variables accessed by shared code
- * @data: Data to send out to the PHY
- * @count: Number of bits to shift out
- *
- * Bits are shifted out in MSB to LSB order.
- */
-static void e1000_shift_out_mdi_bits(struct e1000_hw *hw, u32 data, u16 count)
-{
- u32 ctrl;
- u32 mask;
-
- /* We need to shift "count" number of bits out to the PHY. So, the value
- * in the "data" parameter will be shifted out to the PHY one bit at a
- * time. In order to do this, "data" must be broken down into bits.
- */
- mask = 0x01;
- mask <<= (count - 1);
-
- ctrl = er32(CTRL);
-
- /* Set MDIO_DIR and MDC_DIR direction bits to be used as output pins. */
- ctrl |= (E1000_CTRL_MDIO_DIR | E1000_CTRL_MDC_DIR);
-
- while (mask) {
- /* A "1" is shifted out to the PHY by setting the MDIO bit to "1" and
- * then raising and lowering the Management Data Clock. A "0" is
- * shifted out to the PHY by setting the MDIO bit to "0" and then
- * raising and lowering the clock.
- */
- if (data & mask)
- ctrl |= E1000_CTRL_MDIO;
- else
- ctrl &= ~E1000_CTRL_MDIO;
-
- ew32(CTRL, ctrl);
- E1000_WRITE_FLUSH();
-
- udelay(10);
-
- e1000_raise_mdi_clk(hw, &ctrl);
- e1000_lower_mdi_clk(hw, &ctrl);
-
- mask = mask >> 1;
- }
-}
-
-/**
- * e1000_shift_in_mdi_bits - Shifts data bits in from the PHY
- * @hw: Struct containing variables accessed by shared code
- *
- * Bits are shifted in in MSB to LSB order.
- */
-static u16 e1000_shift_in_mdi_bits(struct e1000_hw *hw)
-{
- u32 ctrl;
- u16 data = 0;
- u8 i;
-
- /* In order to read a register from the PHY, we need to shift in a total
- * of 18 bits from the PHY. The first two bit (turnaround) times are used
- * to avoid contention on the MDIO pin when a read operation is performed.
- * These two bits are ignored by us and thrown away. Bits are "shifted in"
- * by raising the input to the Management Data Clock (setting the MDC bit),
- * and then reading the value of the MDIO bit.
- */
- ctrl = er32(CTRL);
-
- /* Clear MDIO_DIR (SWDPIO1) to indicate this bit is to be used as input. */
- ctrl &= ~E1000_CTRL_MDIO_DIR;
- ctrl &= ~E1000_CTRL_MDIO;
-
- ew32(CTRL, ctrl);
- E1000_WRITE_FLUSH();
-
- /* Raise and Lower the clock before reading in the data. This accounts for
- * the turnaround bits. The first clock occurred when we clocked out the
- * last bit of the Register Address.
- */
- e1000_raise_mdi_clk(hw, &ctrl);
- e1000_lower_mdi_clk(hw, &ctrl);
-
- for (data = 0, i = 0; i < 16; i++) {
- data = data << 1;
- e1000_raise_mdi_clk(hw, &ctrl);
- ctrl = er32(CTRL);
- /* Check to see if we shifted in a "1". */
- if (ctrl & E1000_CTRL_MDIO)
- data |= 1;
- e1000_lower_mdi_clk(hw, &ctrl);
- }
-
- e1000_raise_mdi_clk(hw, &ctrl);
- e1000_lower_mdi_clk(hw, &ctrl);
-
- return data;
-}
-
-
-/**
- * e1000_read_phy_reg - read a phy register
- * @hw: Struct containing variables accessed by shared code
- * @reg_addr: address of the PHY register to read
- *
- * Reads the value from a PHY register, if the value is on a specific non zero
- * page, sets the page first.
- */
-s32 e1000_read_phy_reg(struct e1000_hw *hw, u32 reg_addr, u16 *phy_data)
-{
- u32 ret_val;
-
- e_dbg("e1000_read_phy_reg");
-
- if ((hw->phy_type == e1000_phy_igp) &&
- (reg_addr > MAX_PHY_MULTI_PAGE_REG)) {
- ret_val = e1000_write_phy_reg_ex(hw, IGP01E1000_PHY_PAGE_SELECT,
- (u16) reg_addr);
- if (ret_val)
- return ret_val;
- }
-
- ret_val = e1000_read_phy_reg_ex(hw, MAX_PHY_REG_ADDRESS & reg_addr,
- phy_data);
-
- return ret_val;
-}
-
-static s32 e1000_read_phy_reg_ex(struct e1000_hw *hw, u32 reg_addr,
- u16 *phy_data)
-{
- u32 i;
- u32 mdic = 0;
- const u32 phy_addr = (hw->mac_type == e1000_ce4100) ? hw->phy_addr : 1;
-
- e_dbg("e1000_read_phy_reg_ex");
-
- if (reg_addr > MAX_PHY_REG_ADDRESS) {
- e_dbg("PHY Address %d is out of range\n", reg_addr);
- return -E1000_ERR_PARAM;
- }
-
- if (hw->mac_type > e1000_82543) {
- /* Set up Op-code, Phy Address, and register address in the MDI
- * Control register. The MAC will take care of interfacing with the
- * PHY to retrieve the desired data.
- */
- if (hw->mac_type == e1000_ce4100) {
- mdic = ((reg_addr << E1000_MDIC_REG_SHIFT) |
- (phy_addr << E1000_MDIC_PHY_SHIFT) |
- (INTEL_CE_GBE_MDIC_OP_READ) |
- (INTEL_CE_GBE_MDIC_GO));
-
- writel(mdic, E1000_MDIO_CMD);
-
- /* Poll the ready bit to see if the MDI read
- * completed
- */
- for (i = 0; i < 64; i++) {
- udelay(50);
- mdic = readl(E1000_MDIO_CMD);
- if (!(mdic & INTEL_CE_GBE_MDIC_GO))
- break;
- }
-
- if (mdic & INTEL_CE_GBE_MDIC_GO) {
- e_dbg("MDI Read did not complete\n");
- return -E1000_ERR_PHY;
- }
-
- mdic = readl(E1000_MDIO_STS);
- if (mdic & INTEL_CE_GBE_MDIC_READ_ERROR) {
- e_dbg("MDI Read Error\n");
- return -E1000_ERR_PHY;
- }
- *phy_data = (u16) mdic;
- } else {
- mdic = ((reg_addr << E1000_MDIC_REG_SHIFT) |
- (phy_addr << E1000_MDIC_PHY_SHIFT) |
- (E1000_MDIC_OP_READ));
-
- ew32(MDIC, mdic);
-
- /* Poll the ready bit to see if the MDI read
- * completed
- */
- for (i = 0; i < 64; i++) {
- udelay(50);
- mdic = er32(MDIC);
- if (mdic & E1000_MDIC_READY)
- break;
- }
- if (!(mdic & E1000_MDIC_READY)) {
- e_dbg("MDI Read did not complete\n");
- return -E1000_ERR_PHY;
- }
- if (mdic & E1000_MDIC_ERROR) {
- e_dbg("MDI Error\n");
- return -E1000_ERR_PHY;
- }
- *phy_data = (u16) mdic;
- }
- } else {
- /* We must first send a preamble through the MDIO pin to signal the
- * beginning of an MII instruction. This is done by sending 32
- * consecutive "1" bits.
- */
- e1000_shift_out_mdi_bits(hw, PHY_PREAMBLE, PHY_PREAMBLE_SIZE);
-
- /* Now combine the next few fields that are required for a read
- * operation. We use this method instead of calling the
- * e1000_shift_out_mdi_bits routine five different times. The format of
- * a MII read instruction consists of a shift out of 14 bits and is
- * defined as follows:
- * <Preamble><SOF><Op Code><Phy Addr><Reg Addr>
- * followed by a shift in of 18 bits. This first two bits shifted in
- * are TurnAround bits used to avoid contention on the MDIO pin when a
- * READ operation is performed. These two bits are thrown away
- * followed by a shift in of 16 bits which contains the desired data.
- */
- mdic = ((reg_addr) | (phy_addr << 5) |
- (PHY_OP_READ << 10) | (PHY_SOF << 12));
-
- e1000_shift_out_mdi_bits(hw, mdic, 14);
-
- /* Now that we've shifted out the read command to the MII, we need to
- * "shift in" the 16-bit value (18 total bits) of the requested PHY
- * register address.
- */
- *phy_data = e1000_shift_in_mdi_bits(hw);
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_write_phy_reg - write a phy register
- *
- * @hw: Struct containing variables accessed by shared code
- * @reg_addr: address of the PHY register to write
- * @data: data to write to the PHY
-
- * Writes a value to a PHY register
- */
-s32 e1000_write_phy_reg(struct e1000_hw *hw, u32 reg_addr, u16 phy_data)
-{
- u32 ret_val;
-
- e_dbg("e1000_write_phy_reg");
-
- if ((hw->phy_type == e1000_phy_igp) &&
- (reg_addr > MAX_PHY_MULTI_PAGE_REG)) {
- ret_val = e1000_write_phy_reg_ex(hw, IGP01E1000_PHY_PAGE_SELECT,
- (u16) reg_addr);
- if (ret_val)
- return ret_val;
- }
-
- ret_val = e1000_write_phy_reg_ex(hw, MAX_PHY_REG_ADDRESS & reg_addr,
- phy_data);
-
- return ret_val;
-}
-
-static s32 e1000_write_phy_reg_ex(struct e1000_hw *hw, u32 reg_addr,
- u16 phy_data)
-{
- u32 i;
- u32 mdic = 0;
- const u32 phy_addr = (hw->mac_type == e1000_ce4100) ? hw->phy_addr : 1;
-
- e_dbg("e1000_write_phy_reg_ex");
-
- if (reg_addr > MAX_PHY_REG_ADDRESS) {
- e_dbg("PHY Address %d is out of range\n", reg_addr);
- return -E1000_ERR_PARAM;
- }
-
- if (hw->mac_type > e1000_82543) {
- /* Set up Op-code, Phy Address, register address, and data
- * intended for the PHY register in the MDI Control register.
- * The MAC will take care of interfacing with the PHY to send
- * the desired data.
- */
- if (hw->mac_type == e1000_ce4100) {
- mdic = (((u32) phy_data) |
- (reg_addr << E1000_MDIC_REG_SHIFT) |
- (phy_addr << E1000_MDIC_PHY_SHIFT) |
- (INTEL_CE_GBE_MDIC_OP_WRITE) |
- (INTEL_CE_GBE_MDIC_GO));
-
- writel(mdic, E1000_MDIO_CMD);
-
- /* Poll the ready bit to see if the MDI read
- * completed
- */
- for (i = 0; i < 640; i++) {
- udelay(5);
- mdic = readl(E1000_MDIO_CMD);
- if (!(mdic & INTEL_CE_GBE_MDIC_GO))
- break;
- }
- if (mdic & INTEL_CE_GBE_MDIC_GO) {
- e_dbg("MDI Write did not complete\n");
- return -E1000_ERR_PHY;
- }
- } else {
- mdic = (((u32) phy_data) |
- (reg_addr << E1000_MDIC_REG_SHIFT) |
- (phy_addr << E1000_MDIC_PHY_SHIFT) |
- (E1000_MDIC_OP_WRITE));
-
- ew32(MDIC, mdic);
-
- /* Poll the ready bit to see if the MDI read
- * completed
- */
- for (i = 0; i < 641; i++) {
- udelay(5);
- mdic = er32(MDIC);
- if (mdic & E1000_MDIC_READY)
- break;
- }
- if (!(mdic & E1000_MDIC_READY)) {
- e_dbg("MDI Write did not complete\n");
- return -E1000_ERR_PHY;
- }
- }
- } else {
- /* We'll need to use the SW defined pins to shift the write command
- * out to the PHY. We first send a preamble to the PHY to signal the
- * beginning of the MII instruction. This is done by sending 32
- * consecutive "1" bits.
- */
- e1000_shift_out_mdi_bits(hw, PHY_PREAMBLE, PHY_PREAMBLE_SIZE);
-
- /* Now combine the remaining required fields that will indicate a
- * write operation. We use this method instead of calling the
- * e1000_shift_out_mdi_bits routine for each field in the command. The
- * format of a MII write instruction is as follows:
- * <Preamble><SOF><Op Code><Phy Addr><Reg Addr><Turnaround><Data>.
- */
- mdic = ((PHY_TURNAROUND) | (reg_addr << 2) | (phy_addr << 7) |
- (PHY_OP_WRITE << 12) | (PHY_SOF << 14));
- mdic <<= 16;
- mdic |= (u32) phy_data;
-
- e1000_shift_out_mdi_bits(hw, mdic, 32);
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_hw_reset - reset the phy, hardware style
- * @hw: Struct containing variables accessed by shared code
- *
- * Returns the PHY to the power-on reset state
- */
-s32 e1000_phy_hw_reset(struct e1000_hw *hw)
-{
- u32 ctrl, ctrl_ext;
- u32 led_ctrl;
-
- e_dbg("e1000_phy_hw_reset");
-
- e_dbg("Resetting Phy...\n");
-
- if (hw->mac_type > e1000_82543) {
- /* Read the device control register and assert the E1000_CTRL_PHY_RST
- * bit. Then, take it out of reset.
- * For e1000 hardware, we delay for 10ms between the assert
- * and deassert.
- */
- ctrl = er32(CTRL);
- ew32(CTRL, ctrl | E1000_CTRL_PHY_RST);
- E1000_WRITE_FLUSH();
-
- msleep(10);
-
- ew32(CTRL, ctrl);
- E1000_WRITE_FLUSH();
-
- } else {
- /* Read the Extended Device Control Register, assert the PHY_RESET_DIR
- * bit to put the PHY into reset. Then, take it out of reset.
- */
- ctrl_ext = er32(CTRL_EXT);
- ctrl_ext |= E1000_CTRL_EXT_SDP4_DIR;
- ctrl_ext &= ~E1000_CTRL_EXT_SDP4_DATA;
- ew32(CTRL_EXT, ctrl_ext);
- E1000_WRITE_FLUSH();
- msleep(10);
- ctrl_ext |= E1000_CTRL_EXT_SDP4_DATA;
- ew32(CTRL_EXT, ctrl_ext);
- E1000_WRITE_FLUSH();
- }
- udelay(150);
-
- if ((hw->mac_type == e1000_82541) || (hw->mac_type == e1000_82547)) {
- /* Configure activity LED after PHY reset */
- led_ctrl = er32(LEDCTL);
- led_ctrl &= IGP_ACTIVITY_LED_MASK;
- led_ctrl |= (IGP_ACTIVITY_LED_ENABLE | IGP_LED3_MODE);
- ew32(LEDCTL, led_ctrl);
- }
-
- /* Wait for FW to finish PHY configuration. */
- return e1000_get_phy_cfg_done(hw);
-}
-
-/**
- * e1000_phy_reset - reset the phy to commit settings
- * @hw: Struct containing variables accessed by shared code
- *
- * Resets the PHY
- * Sets bit 15 of the MII Control register
- */
-s32 e1000_phy_reset(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_phy_reset");
-
- switch (hw->phy_type) {
- case e1000_phy_igp:
- ret_val = e1000_phy_hw_reset(hw);
- if (ret_val)
- return ret_val;
- break;
- default:
- ret_val = e1000_read_phy_reg(hw, PHY_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= MII_CR_RESET;
- ret_val = e1000_write_phy_reg(hw, PHY_CTRL, phy_data);
- if (ret_val)
- return ret_val;
-
- udelay(1);
- break;
- }
-
- if (hw->phy_type == e1000_phy_igp)
- e1000_phy_init_script(hw);
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_detect_gig_phy - check the phy type
- * @hw: Struct containing variables accessed by shared code
- *
- * Probes the expected PHY address for known PHY IDs
- */
-static s32 e1000_detect_gig_phy(struct e1000_hw *hw)
-{
- s32 phy_init_status, ret_val;
- u16 phy_id_high, phy_id_low;
- bool match = false;
-
- e_dbg("e1000_detect_gig_phy");
-
- if (hw->phy_id != 0)
- return E1000_SUCCESS;
-
- /* Read the PHY ID Registers to identify which PHY is onboard. */
- ret_val = e1000_read_phy_reg(hw, PHY_ID1, &phy_id_high);
- if (ret_val)
- return ret_val;
-
- hw->phy_id = (u32) (phy_id_high << 16);
- udelay(20);
- ret_val = e1000_read_phy_reg(hw, PHY_ID2, &phy_id_low);
- if (ret_val)
- return ret_val;
-
- hw->phy_id |= (u32) (phy_id_low & PHY_REVISION_MASK);
- hw->phy_revision = (u32) phy_id_low & ~PHY_REVISION_MASK;
-
- switch (hw->mac_type) {
- case e1000_82543:
- if (hw->phy_id == M88E1000_E_PHY_ID)
- match = true;
- break;
- case e1000_82544:
- if (hw->phy_id == M88E1000_I_PHY_ID)
- match = true;
- break;
- case e1000_82540:
- case e1000_82545:
- case e1000_82545_rev_3:
- case e1000_82546:
- case e1000_82546_rev_3:
- if (hw->phy_id == M88E1011_I_PHY_ID)
- match = true;
- break;
- case e1000_ce4100:
- if ((hw->phy_id == RTL8211B_PHY_ID) ||
- (hw->phy_id == RTL8201N_PHY_ID) ||
- (hw->phy_id == M88E1118_E_PHY_ID))
- match = true;
- break;
- case e1000_82541:
- case e1000_82541_rev_2:
- case e1000_82547:
- case e1000_82547_rev_2:
- if (hw->phy_id == IGP01E1000_I_PHY_ID)
- match = true;
- break;
- default:
- e_dbg("Invalid MAC type %d\n", hw->mac_type);
- return -E1000_ERR_CONFIG;
- }
- phy_init_status = e1000_set_phy_type(hw);
-
- if ((match) && (phy_init_status == E1000_SUCCESS)) {
- e_dbg("PHY ID 0x%X detected\n", hw->phy_id);
- return E1000_SUCCESS;
- }
- e_dbg("Invalid PHY ID 0x%X\n", hw->phy_id);
- return -E1000_ERR_PHY;
-}
-
-/**
- * e1000_phy_reset_dsp - reset DSP
- * @hw: Struct containing variables accessed by shared code
- *
- * Resets the PHY's DSP
- */
-static s32 e1000_phy_reset_dsp(struct e1000_hw *hw)
-{
- s32 ret_val;
- e_dbg("e1000_phy_reset_dsp");
-
- do {
- ret_val = e1000_write_phy_reg(hw, 29, 0x001d);
- if (ret_val)
- break;
- ret_val = e1000_write_phy_reg(hw, 30, 0x00c1);
- if (ret_val)
- break;
- ret_val = e1000_write_phy_reg(hw, 30, 0x0000);
- if (ret_val)
- break;
- ret_val = E1000_SUCCESS;
- } while (0);
-
- return ret_val;
-}
-
-/**
- * e1000_phy_igp_get_info - get igp specific registers
- * @hw: Struct containing variables accessed by shared code
- * @phy_info: PHY information structure
- *
- * Get PHY information from various PHY registers for igp PHY only.
- */
-static s32 e1000_phy_igp_get_info(struct e1000_hw *hw,
- struct e1000_phy_info *phy_info)
-{
- s32 ret_val;
- u16 phy_data, min_length, max_length, average;
- e1000_rev_polarity polarity;
-
- e_dbg("e1000_phy_igp_get_info");
-
- /* The downshift status is checked only once, after link is established,
- * and it stored in the hw->speed_downgraded parameter. */
- phy_info->downshift = (e1000_downshift) hw->speed_downgraded;
-
- /* IGP01E1000 does not need to support it. */
- phy_info->extended_10bt_distance = e1000_10bt_ext_dist_enable_normal;
-
- /* IGP01E1000 always correct polarity reversal */
- phy_info->polarity_correction = e1000_polarity_reversal_enabled;
-
- /* Check polarity status */
- ret_val = e1000_check_polarity(hw, &polarity);
- if (ret_val)
- return ret_val;
-
- phy_info->cable_polarity = polarity;
-
- ret_val = e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_info->mdix_mode =
- (e1000_auto_x_mode) ((phy_data & IGP01E1000_PSSR_MDIX) >>
- IGP01E1000_PSSR_MDIX_SHIFT);
-
- if ((phy_data & IGP01E1000_PSSR_SPEED_MASK) ==
- IGP01E1000_PSSR_SPEED_1000MBPS) {
- /* Local/Remote Receiver Information are only valid at 1000 Mbps */
- ret_val = e1000_read_phy_reg(hw, PHY_1000T_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_info->local_rx = ((phy_data & SR_1000T_LOCAL_RX_STATUS) >>
- SR_1000T_LOCAL_RX_STATUS_SHIFT) ?
- e1000_1000t_rx_status_ok : e1000_1000t_rx_status_not_ok;
- phy_info->remote_rx = ((phy_data & SR_1000T_REMOTE_RX_STATUS) >>
- SR_1000T_REMOTE_RX_STATUS_SHIFT) ?
- e1000_1000t_rx_status_ok : e1000_1000t_rx_status_not_ok;
-
- /* Get cable length */
- ret_val = e1000_get_cable_length(hw, &min_length, &max_length);
- if (ret_val)
- return ret_val;
-
- /* Translate to old method */
- average = (max_length + min_length) / 2;
-
- if (average <= e1000_igp_cable_length_50)
- phy_info->cable_length = e1000_cable_length_50;
- else if (average <= e1000_igp_cable_length_80)
- phy_info->cable_length = e1000_cable_length_50_80;
- else if (average <= e1000_igp_cable_length_110)
- phy_info->cable_length = e1000_cable_length_80_110;
- else if (average <= e1000_igp_cable_length_140)
- phy_info->cable_length = e1000_cable_length_110_140;
- else
- phy_info->cable_length = e1000_cable_length_140;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_m88_get_info - get m88 specific registers
- * @hw: Struct containing variables accessed by shared code
- * @phy_info: PHY information structure
- *
- * Get PHY information from various PHY registers for m88 PHY only.
- */
-static s32 e1000_phy_m88_get_info(struct e1000_hw *hw,
- struct e1000_phy_info *phy_info)
-{
- s32 ret_val;
- u16 phy_data;
- e1000_rev_polarity polarity;
-
- e_dbg("e1000_phy_m88_get_info");
-
- /* The downshift status is checked only once, after link is established,
- * and it stored in the hw->speed_downgraded parameter. */
- phy_info->downshift = (e1000_downshift) hw->speed_downgraded;
-
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_CTRL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_info->extended_10bt_distance =
- ((phy_data & M88E1000_PSCR_10BT_EXT_DIST_ENABLE) >>
- M88E1000_PSCR_10BT_EXT_DIST_ENABLE_SHIFT) ?
- e1000_10bt_ext_dist_enable_lower :
- e1000_10bt_ext_dist_enable_normal;
-
- phy_info->polarity_correction =
- ((phy_data & M88E1000_PSCR_POLARITY_REVERSAL) >>
- M88E1000_PSCR_POLARITY_REVERSAL_SHIFT) ?
- e1000_polarity_reversal_disabled : e1000_polarity_reversal_enabled;
-
- /* Check polarity status */
- ret_val = e1000_check_polarity(hw, &polarity);
- if (ret_val)
- return ret_val;
- phy_info->cable_polarity = polarity;
-
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_info->mdix_mode =
- (e1000_auto_x_mode) ((phy_data & M88E1000_PSSR_MDIX) >>
- M88E1000_PSSR_MDIX_SHIFT);
-
- if ((phy_data & M88E1000_PSSR_SPEED) == M88E1000_PSSR_1000MBS) {
- /* Cable Length Estimation and Local/Remote Receiver Information
- * are only valid at 1000 Mbps.
- */
- phy_info->cable_length =
- (e1000_cable_length) ((phy_data &
- M88E1000_PSSR_CABLE_LENGTH) >>
- M88E1000_PSSR_CABLE_LENGTH_SHIFT);
-
- ret_val = e1000_read_phy_reg(hw, PHY_1000T_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_info->local_rx = ((phy_data & SR_1000T_LOCAL_RX_STATUS) >>
- SR_1000T_LOCAL_RX_STATUS_SHIFT) ?
- e1000_1000t_rx_status_ok : e1000_1000t_rx_status_not_ok;
- phy_info->remote_rx = ((phy_data & SR_1000T_REMOTE_RX_STATUS) >>
- SR_1000T_REMOTE_RX_STATUS_SHIFT) ?
- e1000_1000t_rx_status_ok : e1000_1000t_rx_status_not_ok;
-
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_phy_get_info - request phy info
- * @hw: Struct containing variables accessed by shared code
- * @phy_info: PHY information structure
- *
- * Get PHY information from various PHY registers
- */
-s32 e1000_phy_get_info(struct e1000_hw *hw, struct e1000_phy_info *phy_info)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_phy_get_info");
-
- phy_info->cable_length = e1000_cable_length_undefined;
- phy_info->extended_10bt_distance = e1000_10bt_ext_dist_enable_undefined;
- phy_info->cable_polarity = e1000_rev_polarity_undefined;
- phy_info->downshift = e1000_downshift_undefined;
- phy_info->polarity_correction = e1000_polarity_reversal_undefined;
- phy_info->mdix_mode = e1000_auto_x_mode_undefined;
- phy_info->local_rx = e1000_1000t_rx_status_undefined;
- phy_info->remote_rx = e1000_1000t_rx_status_undefined;
-
- if (hw->media_type != e1000_media_type_copper) {
- e_dbg("PHY info is only valid for copper media\n");
- return -E1000_ERR_CONFIG;
- }
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &phy_data);
- if (ret_val)
- return ret_val;
-
- if ((phy_data & MII_SR_LINK_STATUS) != MII_SR_LINK_STATUS) {
- e_dbg("PHY info is only valid if link is up\n");
- return -E1000_ERR_CONFIG;
- }
-
- if (hw->phy_type == e1000_phy_igp)
- return e1000_phy_igp_get_info(hw, phy_info);
- else if ((hw->phy_type == e1000_phy_8211) ||
- (hw->phy_type == e1000_phy_8201))
- return E1000_SUCCESS;
- else
- return e1000_phy_m88_get_info(hw, phy_info);
-}
-
-s32 e1000_validate_mdi_setting(struct e1000_hw *hw)
-{
- e_dbg("e1000_validate_mdi_settings");
-
- if (!hw->autoneg && (hw->mdix == 0 || hw->mdix == 3)) {
- e_dbg("Invalid MDI setting detected\n");
- hw->mdix = 1;
- return -E1000_ERR_CONFIG;
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_init_eeprom_params - initialize sw eeprom vars
- * @hw: Struct containing variables accessed by shared code
- *
- * Sets up eeprom variables in the hw struct. Must be called after mac_type
- * is configured.
- */
-s32 e1000_init_eeprom_params(struct e1000_hw *hw)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 eecd = er32(EECD);
- s32 ret_val = E1000_SUCCESS;
- u16 eeprom_size;
-
- e_dbg("e1000_init_eeprom_params");
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- case e1000_82544:
- eeprom->type = e1000_eeprom_microwire;
- eeprom->word_size = 64;
- eeprom->opcode_bits = 3;
- eeprom->address_bits = 6;
- eeprom->delay_usec = 50;
- break;
- case e1000_82540:
- case e1000_82545:
- case e1000_82545_rev_3:
- case e1000_82546:
- case e1000_82546_rev_3:
- eeprom->type = e1000_eeprom_microwire;
- eeprom->opcode_bits = 3;
- eeprom->delay_usec = 50;
- if (eecd & E1000_EECD_SIZE) {
- eeprom->word_size = 256;
- eeprom->address_bits = 8;
- } else {
- eeprom->word_size = 64;
- eeprom->address_bits = 6;
- }
- break;
- case e1000_82541:
- case e1000_82541_rev_2:
- case e1000_82547:
- case e1000_82547_rev_2:
- if (eecd & E1000_EECD_TYPE) {
- eeprom->type = e1000_eeprom_spi;
- eeprom->opcode_bits = 8;
- eeprom->delay_usec = 1;
- if (eecd & E1000_EECD_ADDR_BITS) {
- eeprom->page_size = 32;
- eeprom->address_bits = 16;
- } else {
- eeprom->page_size = 8;
- eeprom->address_bits = 8;
- }
- } else {
- eeprom->type = e1000_eeprom_microwire;
- eeprom->opcode_bits = 3;
- eeprom->delay_usec = 50;
- if (eecd & E1000_EECD_ADDR_BITS) {
- eeprom->word_size = 256;
- eeprom->address_bits = 8;
- } else {
- eeprom->word_size = 64;
- eeprom->address_bits = 6;
- }
- }
- break;
- default:
- break;
- }
-
- if (eeprom->type == e1000_eeprom_spi) {
- /* eeprom_size will be an enum [0..8] that maps to eeprom sizes 128B to
- * 32KB (incremented by powers of 2).
- */
- /* Set to default value for initial eeprom read. */
- eeprom->word_size = 64;
- ret_val = e1000_read_eeprom(hw, EEPROM_CFG, 1, &eeprom_size);
- if (ret_val)
- return ret_val;
- eeprom_size =
- (eeprom_size & EEPROM_SIZE_MASK) >> EEPROM_SIZE_SHIFT;
- /* 256B eeprom size was not supported in earlier hardware, so we
- * bump eeprom_size up one to ensure that "1" (which maps to 256B)
- * is never the result used in the shifting logic below. */
- if (eeprom_size)
- eeprom_size++;
-
- eeprom->word_size = 1 << (eeprom_size + EEPROM_WORD_SIZE_SHIFT);
- }
- return ret_val;
-}
-
-/**
- * e1000_raise_ee_clk - Raises the EEPROM's clock input.
- * @hw: Struct containing variables accessed by shared code
- * @eecd: EECD's current value
- */
-static void e1000_raise_ee_clk(struct e1000_hw *hw, u32 *eecd)
-{
- /* Raise the clock input to the EEPROM (by setting the SK bit), and then
- * wait <delay> microseconds.
- */
- *eecd = *eecd | E1000_EECD_SK;
- ew32(EECD, *eecd);
- E1000_WRITE_FLUSH();
- udelay(hw->eeprom.delay_usec);
-}
-
-/**
- * e1000_lower_ee_clk - Lowers the EEPROM's clock input.
- * @hw: Struct containing variables accessed by shared code
- * @eecd: EECD's current value
- */
-static void e1000_lower_ee_clk(struct e1000_hw *hw, u32 *eecd)
-{
- /* Lower the clock input to the EEPROM (by clearing the SK bit), and then
- * wait 50 microseconds.
- */
- *eecd = *eecd & ~E1000_EECD_SK;
- ew32(EECD, *eecd);
- E1000_WRITE_FLUSH();
- udelay(hw->eeprom.delay_usec);
-}
-
-/**
- * e1000_shift_out_ee_bits - Shift data bits out to the EEPROM.
- * @hw: Struct containing variables accessed by shared code
- * @data: data to send to the EEPROM
- * @count: number of bits to shift out
- */
-static void e1000_shift_out_ee_bits(struct e1000_hw *hw, u16 data, u16 count)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 eecd;
- u32 mask;
-
- /* We need to shift "count" bits out to the EEPROM. So, value in the
- * "data" parameter will be shifted out to the EEPROM one bit at a time.
- * In order to do this, "data" must be broken down into bits.
- */
- mask = 0x01 << (count - 1);
- eecd = er32(EECD);
- if (eeprom->type == e1000_eeprom_microwire) {
- eecd &= ~E1000_EECD_DO;
- } else if (eeprom->type == e1000_eeprom_spi) {
- eecd |= E1000_EECD_DO;
- }
- do {
- /* A "1" is shifted out to the EEPROM by setting bit "DI" to a "1",
- * and then raising and then lowering the clock (the SK bit controls
- * the clock input to the EEPROM). A "0" is shifted out to the EEPROM
- * by setting "DI" to "0" and then raising and then lowering the clock.
- */
- eecd &= ~E1000_EECD_DI;
-
- if (data & mask)
- eecd |= E1000_EECD_DI;
-
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
-
- udelay(eeprom->delay_usec);
-
- e1000_raise_ee_clk(hw, &eecd);
- e1000_lower_ee_clk(hw, &eecd);
-
- mask = mask >> 1;
-
- } while (mask);
-
- /* We leave the "DI" bit set to "0" when we leave this routine. */
- eecd &= ~E1000_EECD_DI;
- ew32(EECD, eecd);
-}
-
-/**
- * e1000_shift_in_ee_bits - Shift data bits in from the EEPROM
- * @hw: Struct containing variables accessed by shared code
- * @count: number of bits to shift in
- */
-static u16 e1000_shift_in_ee_bits(struct e1000_hw *hw, u16 count)
-{
- u32 eecd;
- u32 i;
- u16 data;
-
- /* In order to read a register from the EEPROM, we need to shift 'count'
- * bits in from the EEPROM. Bits are "shifted in" by raising the clock
- * input to the EEPROM (setting the SK bit), and then reading the value of
- * the "DO" bit. During this "shifting in" process the "DI" bit should
- * always be clear.
- */
-
- eecd = er32(EECD);
-
- eecd &= ~(E1000_EECD_DO | E1000_EECD_DI);
- data = 0;
-
- for (i = 0; i < count; i++) {
- data = data << 1;
- e1000_raise_ee_clk(hw, &eecd);
-
- eecd = er32(EECD);
-
- eecd &= ~(E1000_EECD_DI);
- if (eecd & E1000_EECD_DO)
- data |= 1;
-
- e1000_lower_ee_clk(hw, &eecd);
- }
-
- return data;
-}
-
-/**
- * e1000_acquire_eeprom - Prepares EEPROM for access
- * @hw: Struct containing variables accessed by shared code
- *
- * Lowers EEPROM clock. Clears input pin. Sets the chip select pin. This
- * function should be called before issuing a command to the EEPROM.
- */
-static s32 e1000_acquire_eeprom(struct e1000_hw *hw)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 eecd, i = 0;
-
- e_dbg("e1000_acquire_eeprom");
-
- eecd = er32(EECD);
-
- /* Request EEPROM Access */
- if (hw->mac_type > e1000_82544) {
- eecd |= E1000_EECD_REQ;
- ew32(EECD, eecd);
- eecd = er32(EECD);
- while ((!(eecd & E1000_EECD_GNT)) &&
- (i < E1000_EEPROM_GRANT_ATTEMPTS)) {
- i++;
- udelay(5);
- eecd = er32(EECD);
- }
- if (!(eecd & E1000_EECD_GNT)) {
- eecd &= ~E1000_EECD_REQ;
- ew32(EECD, eecd);
- e_dbg("Could not acquire EEPROM grant\n");
- return -E1000_ERR_EEPROM;
- }
- }
-
- /* Setup EEPROM for Read/Write */
-
- if (eeprom->type == e1000_eeprom_microwire) {
- /* Clear SK and DI */
- eecd &= ~(E1000_EECD_DI | E1000_EECD_SK);
- ew32(EECD, eecd);
-
- /* Set CS */
- eecd |= E1000_EECD_CS;
- ew32(EECD, eecd);
- } else if (eeprom->type == e1000_eeprom_spi) {
- /* Clear SK and CS */
- eecd &= ~(E1000_EECD_CS | E1000_EECD_SK);
- ew32(EECD, eecd);
- udelay(1);
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_standby_eeprom - Returns EEPROM to a "standby" state
- * @hw: Struct containing variables accessed by shared code
- */
-static void e1000_standby_eeprom(struct e1000_hw *hw)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 eecd;
-
- eecd = er32(EECD);
-
- if (eeprom->type == e1000_eeprom_microwire) {
- eecd &= ~(E1000_EECD_CS | E1000_EECD_SK);
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
-
- /* Clock high */
- eecd |= E1000_EECD_SK;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
-
- /* Select EEPROM */
- eecd |= E1000_EECD_CS;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
-
- /* Clock low */
- eecd &= ~E1000_EECD_SK;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
- } else if (eeprom->type == e1000_eeprom_spi) {
- /* Toggle CS to flush commands */
- eecd |= E1000_EECD_CS;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
- eecd &= ~E1000_EECD_CS;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(eeprom->delay_usec);
- }
-}
-
-/**
- * e1000_release_eeprom - drop chip select
- * @hw: Struct containing variables accessed by shared code
- *
- * Terminates a command by inverting the EEPROM's chip select pin
- */
-static void e1000_release_eeprom(struct e1000_hw *hw)
-{
- u32 eecd;
-
- e_dbg("e1000_release_eeprom");
-
- eecd = er32(EECD);
-
- if (hw->eeprom.type == e1000_eeprom_spi) {
- eecd |= E1000_EECD_CS; /* Pull CS high */
- eecd &= ~E1000_EECD_SK; /* Lower SCK */
-
- ew32(EECD, eecd);
-
- udelay(hw->eeprom.delay_usec);
- } else if (hw->eeprom.type == e1000_eeprom_microwire) {
- /* cleanup eeprom */
-
- /* CS on Microwire is active-high */
- eecd &= ~(E1000_EECD_CS | E1000_EECD_DI);
-
- ew32(EECD, eecd);
-
- /* Rising edge of clock */
- eecd |= E1000_EECD_SK;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(hw->eeprom.delay_usec);
-
- /* Falling edge of clock */
- eecd &= ~E1000_EECD_SK;
- ew32(EECD, eecd);
- E1000_WRITE_FLUSH();
- udelay(hw->eeprom.delay_usec);
- }
-
- /* Stop requesting EEPROM access */
- if (hw->mac_type > e1000_82544) {
- eecd &= ~E1000_EECD_REQ;
- ew32(EECD, eecd);
- }
-}
-
-/**
- * e1000_spi_eeprom_ready - Reads a 16 bit word from the EEPROM.
- * @hw: Struct containing variables accessed by shared code
- */
-static s32 e1000_spi_eeprom_ready(struct e1000_hw *hw)
-{
- u16 retry_count = 0;
- u8 spi_stat_reg;
-
- e_dbg("e1000_spi_eeprom_ready");
-
- /* Read "Status Register" repeatedly until the LSB is cleared. The
- * EEPROM will signal that the command has been completed by clearing
- * bit 0 of the internal status register. If it's not cleared within
- * 5 milliseconds, then error out.
- */
- retry_count = 0;
- do {
- e1000_shift_out_ee_bits(hw, EEPROM_RDSR_OPCODE_SPI,
- hw->eeprom.opcode_bits);
- spi_stat_reg = (u8) e1000_shift_in_ee_bits(hw, 8);
- if (!(spi_stat_reg & EEPROM_STATUS_RDY_SPI))
- break;
-
- udelay(5);
- retry_count += 5;
-
- e1000_standby_eeprom(hw);
- } while (retry_count < EEPROM_MAX_RETRY_SPI);
-
- /* ATMEL SPI write time could vary from 0-20mSec on 3.3V devices (and
- * only 0-5mSec on 5V devices)
- */
- if (retry_count >= EEPROM_MAX_RETRY_SPI) {
- e_dbg("SPI EEPROM Status error\n");
- return -E1000_ERR_EEPROM;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_read_eeprom - Reads a 16 bit word from the EEPROM.
- * @hw: Struct containing variables accessed by shared code
- * @offset: offset of word in the EEPROM to read
- * @data: word read from the EEPROM
- * @words: number of words to read
- */
-s32 e1000_read_eeprom(struct e1000_hw *hw, u16 offset, u16 words, u16 *data)
-{
- s32 ret;
- spin_lock(&e1000_eeprom_lock);
- ret = e1000_do_read_eeprom(hw, offset, words, data);
- spin_unlock(&e1000_eeprom_lock);
- return ret;
-}
-
-static s32 e1000_do_read_eeprom(struct e1000_hw *hw, u16 offset, u16 words,
- u16 *data)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 i = 0;
-
- e_dbg("e1000_read_eeprom");
-
- if (hw->mac_type == e1000_ce4100) {
- GBE_CONFIG_FLASH_READ(GBE_CONFIG_BASE_VIRT, offset, words,
- data);
- return E1000_SUCCESS;
- }
-
- /* If eeprom is not yet detected, do so now */
- if (eeprom->word_size == 0)
- e1000_init_eeprom_params(hw);
-
- /* A check for invalid values: offset too large, too many words, and not
- * enough words.
- */
- if ((offset >= eeprom->word_size)
- || (words > eeprom->word_size - offset) || (words == 0)) {
- e_dbg("\"words\" parameter out of bounds. Words = %d,"
- "size = %d\n", offset, eeprom->word_size);
- return -E1000_ERR_EEPROM;
- }
-
- /* EEPROM's that don't use EERD to read require us to bit-bang the SPI
- * directly. In this case, we need to acquire the EEPROM so that
- * FW or other port software does not interrupt.
- */
- /* Prepare the EEPROM for bit-bang reading */
- if (e1000_acquire_eeprom(hw) != E1000_SUCCESS)
- return -E1000_ERR_EEPROM;
-
- /* Set up the SPI or Microwire EEPROM for bit-bang reading. We have
- * acquired the EEPROM at this point, so any returns should release it */
- if (eeprom->type == e1000_eeprom_spi) {
- u16 word_in;
- u8 read_opcode = EEPROM_READ_OPCODE_SPI;
-
- if (e1000_spi_eeprom_ready(hw)) {
- e1000_release_eeprom(hw);
- return -E1000_ERR_EEPROM;
- }
-
- e1000_standby_eeprom(hw);
-
- /* Some SPI eeproms use the 8th address bit embedded in the opcode */
- if ((eeprom->address_bits == 8) && (offset >= 128))
- read_opcode |= EEPROM_A8_OPCODE_SPI;
-
- /* Send the READ command (opcode + addr) */
- e1000_shift_out_ee_bits(hw, read_opcode, eeprom->opcode_bits);
- e1000_shift_out_ee_bits(hw, (u16) (offset * 2),
- eeprom->address_bits);
-
- /* Read the data. The address of the eeprom internally increments with
- * each byte (spi) being read, saving on the overhead of eeprom setup
- * and tear-down. The address counter will roll over if reading beyond
- * the size of the eeprom, thus allowing the entire memory to be read
- * starting from any offset. */
- for (i = 0; i < words; i++) {
- word_in = e1000_shift_in_ee_bits(hw, 16);
- data[i] = (word_in >> 8) | (word_in << 8);
- }
- } else if (eeprom->type == e1000_eeprom_microwire) {
- for (i = 0; i < words; i++) {
- /* Send the READ command (opcode + addr) */
- e1000_shift_out_ee_bits(hw,
- EEPROM_READ_OPCODE_MICROWIRE,
- eeprom->opcode_bits);
- e1000_shift_out_ee_bits(hw, (u16) (offset + i),
- eeprom->address_bits);
-
- /* Read the data. For microwire, each word requires the overhead
- * of eeprom setup and tear-down. */
- data[i] = e1000_shift_in_ee_bits(hw, 16);
- e1000_standby_eeprom(hw);
- }
- }
-
- /* End this read operation */
- e1000_release_eeprom(hw);
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_validate_eeprom_checksum - Verifies that the EEPROM has a valid checksum
- * @hw: Struct containing variables accessed by shared code
- *
- * Reads the first 64 16 bit words of the EEPROM and sums the values read.
- * If the the sum of the 64 16 bit words is 0xBABA, the EEPROM's checksum is
- * valid.
- */
-s32 e1000_validate_eeprom_checksum(struct e1000_hw *hw)
-{
- u16 checksum = 0;
- u16 i, eeprom_data;
-
- e_dbg("e1000_validate_eeprom_checksum");
-
- for (i = 0; i < (EEPROM_CHECKSUM_REG + 1); i++) {
- if (e1000_read_eeprom(hw, i, 1, &eeprom_data) < 0) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
- checksum += eeprom_data;
- }
-
- if (checksum == (u16) EEPROM_SUM)
- return E1000_SUCCESS;
- else {
- e_dbg("EEPROM Checksum Invalid\n");
- return -E1000_ERR_EEPROM;
- }
-}
-
-/**
- * e1000_update_eeprom_checksum - Calculates/writes the EEPROM checksum
- * @hw: Struct containing variables accessed by shared code
- *
- * Sums the first 63 16 bit words of the EEPROM. Subtracts the sum from 0xBABA.
- * Writes the difference to word offset 63 of the EEPROM.
- */
-s32 e1000_update_eeprom_checksum(struct e1000_hw *hw)
-{
- u16 checksum = 0;
- u16 i, eeprom_data;
-
- e_dbg("e1000_update_eeprom_checksum");
-
- for (i = 0; i < EEPROM_CHECKSUM_REG; i++) {
- if (e1000_read_eeprom(hw, i, 1, &eeprom_data) < 0) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
- checksum += eeprom_data;
- }
- checksum = (u16) EEPROM_SUM - checksum;
- if (e1000_write_eeprom(hw, EEPROM_CHECKSUM_REG, 1, &checksum) < 0) {
- e_dbg("EEPROM Write Error\n");
- return -E1000_ERR_EEPROM;
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_write_eeprom - write words to the different EEPROM types.
- * @hw: Struct containing variables accessed by shared code
- * @offset: offset within the EEPROM to be written to
- * @words: number of words to write
- * @data: 16 bit word to be written to the EEPROM
- *
- * If e1000_update_eeprom_checksum is not called after this function, the
- * EEPROM will most likely contain an invalid checksum.
- */
-s32 e1000_write_eeprom(struct e1000_hw *hw, u16 offset, u16 words, u16 *data)
-{
- s32 ret;
- spin_lock(&e1000_eeprom_lock);
- ret = e1000_do_write_eeprom(hw, offset, words, data);
- spin_unlock(&e1000_eeprom_lock);
- return ret;
-}
-
-static s32 e1000_do_write_eeprom(struct e1000_hw *hw, u16 offset, u16 words,
- u16 *data)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- s32 status = 0;
-
- e_dbg("e1000_write_eeprom");
-
- if (hw->mac_type == e1000_ce4100) {
- GBE_CONFIG_FLASH_WRITE(GBE_CONFIG_BASE_VIRT, offset, words,
- data);
- return E1000_SUCCESS;
- }
-
- /* If eeprom is not yet detected, do so now */
- if (eeprom->word_size == 0)
- e1000_init_eeprom_params(hw);
-
- /* A check for invalid values: offset too large, too many words, and not
- * enough words.
- */
- if ((offset >= eeprom->word_size)
- || (words > eeprom->word_size - offset) || (words == 0)) {
- e_dbg("\"words\" parameter out of bounds\n");
- return -E1000_ERR_EEPROM;
- }
-
- /* Prepare the EEPROM for writing */
- if (e1000_acquire_eeprom(hw) != E1000_SUCCESS)
- return -E1000_ERR_EEPROM;
-
- if (eeprom->type == e1000_eeprom_microwire) {
- status = e1000_write_eeprom_microwire(hw, offset, words, data);
- } else {
- status = e1000_write_eeprom_spi(hw, offset, words, data);
- msleep(10);
- }
-
- /* Done with writing */
- e1000_release_eeprom(hw);
-
- return status;
-}
-
-/**
- * e1000_write_eeprom_spi - Writes a 16 bit word to a given offset in an SPI EEPROM.
- * @hw: Struct containing variables accessed by shared code
- * @offset: offset within the EEPROM to be written to
- * @words: number of words to write
- * @data: pointer to array of 8 bit words to be written to the EEPROM
- */
-static s32 e1000_write_eeprom_spi(struct e1000_hw *hw, u16 offset, u16 words,
- u16 *data)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u16 widx = 0;
-
- e_dbg("e1000_write_eeprom_spi");
-
- while (widx < words) {
- u8 write_opcode = EEPROM_WRITE_OPCODE_SPI;
-
- if (e1000_spi_eeprom_ready(hw))
- return -E1000_ERR_EEPROM;
-
- e1000_standby_eeprom(hw);
-
- /* Send the WRITE ENABLE command (8 bit opcode ) */
- e1000_shift_out_ee_bits(hw, EEPROM_WREN_OPCODE_SPI,
- eeprom->opcode_bits);
-
- e1000_standby_eeprom(hw);
-
- /* Some SPI eeproms use the 8th address bit embedded in the opcode */
- if ((eeprom->address_bits == 8) && (offset >= 128))
- write_opcode |= EEPROM_A8_OPCODE_SPI;
-
- /* Send the Write command (8-bit opcode + addr) */
- e1000_shift_out_ee_bits(hw, write_opcode, eeprom->opcode_bits);
-
- e1000_shift_out_ee_bits(hw, (u16) ((offset + widx) * 2),
- eeprom->address_bits);
-
- /* Send the data */
-
- /* Loop to allow for up to whole page write (32 bytes) of eeprom */
- while (widx < words) {
- u16 word_out = data[widx];
- word_out = (word_out >> 8) | (word_out << 8);
- e1000_shift_out_ee_bits(hw, word_out, 16);
- widx++;
-
- /* Some larger eeprom sizes are capable of a 32-byte PAGE WRITE
- * operation, while the smaller eeproms are capable of an 8-byte
- * PAGE WRITE operation. Break the inner loop to pass new address
- */
- if ((((offset + widx) * 2) % eeprom->page_size) == 0) {
- e1000_standby_eeprom(hw);
- break;
- }
- }
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_write_eeprom_microwire - Writes a 16 bit word to a given offset in a Microwire EEPROM.
- * @hw: Struct containing variables accessed by shared code
- * @offset: offset within the EEPROM to be written to
- * @words: number of words to write
- * @data: pointer to array of 8 bit words to be written to the EEPROM
- */
-static s32 e1000_write_eeprom_microwire(struct e1000_hw *hw, u16 offset,
- u16 words, u16 *data)
-{
- struct e1000_eeprom_info *eeprom = &hw->eeprom;
- u32 eecd;
- u16 words_written = 0;
- u16 i = 0;
-
- e_dbg("e1000_write_eeprom_microwire");
-
- /* Send the write enable command to the EEPROM (3-bit opcode plus
- * 6/8-bit dummy address beginning with 11). It's less work to include
- * the 11 of the dummy address as part of the opcode than it is to shift
- * it over the correct number of bits for the address. This puts the
- * EEPROM into write/erase mode.
- */
- e1000_shift_out_ee_bits(hw, EEPROM_EWEN_OPCODE_MICROWIRE,
- (u16) (eeprom->opcode_bits + 2));
-
- e1000_shift_out_ee_bits(hw, 0, (u16) (eeprom->address_bits - 2));
-
- /* Prepare the EEPROM */
- e1000_standby_eeprom(hw);
-
- while (words_written < words) {
- /* Send the Write command (3-bit opcode + addr) */
- e1000_shift_out_ee_bits(hw, EEPROM_WRITE_OPCODE_MICROWIRE,
- eeprom->opcode_bits);
-
- e1000_shift_out_ee_bits(hw, (u16) (offset + words_written),
- eeprom->address_bits);
-
- /* Send the data */
- e1000_shift_out_ee_bits(hw, data[words_written], 16);
-
- /* Toggle the CS line. This in effect tells the EEPROM to execute
- * the previous command.
- */
- e1000_standby_eeprom(hw);
-
- /* Read DO repeatedly until it is high (equal to '1'). The EEPROM will
- * signal that the command has been completed by raising the DO signal.
- * If DO does not go high in 10 milliseconds, then error out.
- */
- for (i = 0; i < 200; i++) {
- eecd = er32(EECD);
- if (eecd & E1000_EECD_DO)
- break;
- udelay(50);
- }
- if (i == 200) {
- e_dbg("EEPROM Write did not complete\n");
- return -E1000_ERR_EEPROM;
- }
-
- /* Recover from write */
- e1000_standby_eeprom(hw);
-
- words_written++;
- }
-
- /* Send the write disable command to the EEPROM (3-bit opcode plus
- * 6/8-bit dummy address beginning with 10). It's less work to include
- * the 10 of the dummy address as part of the opcode than it is to shift
- * it over the correct number of bits for the address. This takes the
- * EEPROM out of write/erase mode.
- */
- e1000_shift_out_ee_bits(hw, EEPROM_EWDS_OPCODE_MICROWIRE,
- (u16) (eeprom->opcode_bits + 2));
-
- e1000_shift_out_ee_bits(hw, 0, (u16) (eeprom->address_bits - 2));
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_read_mac_addr - read the adapters MAC from eeprom
- * @hw: Struct containing variables accessed by shared code
- *
- * Reads the adapter's MAC address from the EEPROM and inverts the LSB for the
- * second function of dual function devices
- */
-s32 e1000_read_mac_addr(struct e1000_hw *hw)
-{
- u16 offset;
- u16 eeprom_data, i;
-
- e_dbg("e1000_read_mac_addr");
-
- for (i = 0; i < NODE_ADDRESS_SIZE; i += 2) {
- offset = i >> 1;
- if (e1000_read_eeprom(hw, offset, 1, &eeprom_data) < 0) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
- hw->perm_mac_addr[i] = (u8) (eeprom_data & 0x00FF);
- hw->perm_mac_addr[i + 1] = (u8) (eeprom_data >> 8);
- }
-
- switch (hw->mac_type) {
- default:
- break;
- case e1000_82546:
- case e1000_82546_rev_3:
- if (er32(STATUS) & E1000_STATUS_FUNC_1)
- hw->perm_mac_addr[5] ^= 0x01;
- break;
- }
-
- for (i = 0; i < NODE_ADDRESS_SIZE; i++)
- hw->mac_addr[i] = hw->perm_mac_addr[i];
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_init_rx_addrs - Initializes receive address filters.
- * @hw: Struct containing variables accessed by shared code
- *
- * Places the MAC address in receive address register 0 and clears the rest
- * of the receive address registers. Clears the multicast table. Assumes
- * the receiver is in reset when the routine is called.
- */
-static void e1000_init_rx_addrs(struct e1000_hw *hw)
-{
- u32 i;
- u32 rar_num;
-
- e_dbg("e1000_init_rx_addrs");
-
- /* Setup the receive address. */
- e_dbg("Programming MAC Address into RAR[0]\n");
-
- e1000_rar_set(hw, hw->mac_addr, 0);
-
- rar_num = E1000_RAR_ENTRIES;
-
- /* Zero out the other 15 receive addresses. */
- e_dbg("Clearing RAR[1-15]\n");
- for (i = 1; i < rar_num; i++) {
- E1000_WRITE_REG_ARRAY(hw, RA, (i << 1), 0);
- E1000_WRITE_FLUSH();
- E1000_WRITE_REG_ARRAY(hw, RA, ((i << 1) + 1), 0);
- E1000_WRITE_FLUSH();
- }
-}
-
-/**
- * e1000_hash_mc_addr - Hashes an address to determine its location in the multicast table
- * @hw: Struct containing variables accessed by shared code
- * @mc_addr: the multicast address to hash
- */
-u32 e1000_hash_mc_addr(struct e1000_hw *hw, u8 *mc_addr)
-{
- u32 hash_value = 0;
-
- /* The portion of the address that is used for the hash table is
- * determined by the mc_filter_type setting.
- */
- switch (hw->mc_filter_type) {
- /* [0] [1] [2] [3] [4] [5]
- * 01 AA 00 12 34 56
- * LSB MSB
- */
- case 0:
- /* [47:36] i.e. 0x563 for above example address */
- hash_value = ((mc_addr[4] >> 4) | (((u16) mc_addr[5]) << 4));
- break;
- case 1:
- /* [46:35] i.e. 0xAC6 for above example address */
- hash_value = ((mc_addr[4] >> 3) | (((u16) mc_addr[5]) << 5));
- break;
- case 2:
- /* [45:34] i.e. 0x5D8 for above example address */
- hash_value = ((mc_addr[4] >> 2) | (((u16) mc_addr[5]) << 6));
- break;
- case 3:
- /* [43:32] i.e. 0x634 for above example address */
- hash_value = ((mc_addr[4]) | (((u16) mc_addr[5]) << 8));
- break;
- }
-
- hash_value &= 0xFFF;
- return hash_value;
-}
-
-/**
- * e1000_rar_set - Puts an ethernet address into a receive address register.
- * @hw: Struct containing variables accessed by shared code
- * @addr: Address to put into receive address register
- * @index: Receive address register to write
- */
-void e1000_rar_set(struct e1000_hw *hw, u8 *addr, u32 index)
-{
- u32 rar_low, rar_high;
-
- /* HW expects these in little endian so we reverse the byte order
- * from network order (big endian) to little endian
- */
- rar_low = ((u32) addr[0] | ((u32) addr[1] << 8) |
- ((u32) addr[2] << 16) | ((u32) addr[3] << 24));
- rar_high = ((u32) addr[4] | ((u32) addr[5] << 8));
-
- /* Disable Rx and flush all Rx frames before enabling RSS to avoid Rx
- * unit hang.
- *
- * Description:
- * If there are any Rx frames queued up or otherwise present in the HW
- * before RSS is enabled, and then we enable RSS, the HW Rx unit will
- * hang. To work around this issue, we have to disable receives and
- * flush out all Rx frames before we enable RSS. To do so, we modify we
- * redirect all Rx traffic to manageability and then reset the HW.
- * This flushes away Rx frames, and (since the redirections to
- * manageability persists across resets) keeps new ones from coming in
- * while we work. Then, we clear the Address Valid AV bit for all MAC
- * addresses and undo the re-direction to manageability.
- * Now, frames are coming in again, but the MAC won't accept them, so
- * far so good. We now proceed to initialize RSS (if necessary) and
- * configure the Rx unit. Last, we re-enable the AV bits and continue
- * on our merry way.
- */
- switch (hw->mac_type) {
- default:
- /* Indicate to hardware the Address is Valid. */
- rar_high |= E1000_RAH_AV;
- break;
- }
-
- E1000_WRITE_REG_ARRAY(hw, RA, (index << 1), rar_low);
- E1000_WRITE_FLUSH();
- E1000_WRITE_REG_ARRAY(hw, RA, ((index << 1) + 1), rar_high);
- E1000_WRITE_FLUSH();
-}
-
-/**
- * e1000_write_vfta - Writes a value to the specified offset in the VLAN filter table.
- * @hw: Struct containing variables accessed by shared code
- * @offset: Offset in VLAN filer table to write
- * @value: Value to write into VLAN filter table
- */
-void e1000_write_vfta(struct e1000_hw *hw, u32 offset, u32 value)
-{
- u32 temp;
-
- if ((hw->mac_type == e1000_82544) && ((offset & 0x1) == 1)) {
- temp = E1000_READ_REG_ARRAY(hw, VFTA, (offset - 1));
- E1000_WRITE_REG_ARRAY(hw, VFTA, offset, value);
- E1000_WRITE_FLUSH();
- E1000_WRITE_REG_ARRAY(hw, VFTA, (offset - 1), temp);
- E1000_WRITE_FLUSH();
- } else {
- E1000_WRITE_REG_ARRAY(hw, VFTA, offset, value);
- E1000_WRITE_FLUSH();
- }
-}
-
-/**
- * e1000_clear_vfta - Clears the VLAN filer table
- * @hw: Struct containing variables accessed by shared code
- */
-static void e1000_clear_vfta(struct e1000_hw *hw)
-{
- u32 offset;
- u32 vfta_value = 0;
- u32 vfta_offset = 0;
- u32 vfta_bit_in_reg = 0;
-
- for (offset = 0; offset < E1000_VLAN_FILTER_TBL_SIZE; offset++) {
- /* If the offset we want to clear is the same offset of the
- * manageability VLAN ID, then clear all bits except that of the
- * manageability unit */
- vfta_value = (offset == vfta_offset) ? vfta_bit_in_reg : 0;
- E1000_WRITE_REG_ARRAY(hw, VFTA, offset, vfta_value);
- E1000_WRITE_FLUSH();
- }
-}
-
-static s32 e1000_id_led_init(struct e1000_hw *hw)
-{
- u32 ledctl;
- const u32 ledctl_mask = 0x000000FF;
- const u32 ledctl_on = E1000_LEDCTL_MODE_LED_ON;
- const u32 ledctl_off = E1000_LEDCTL_MODE_LED_OFF;
- u16 eeprom_data, i, temp;
- const u16 led_mask = 0x0F;
-
- e_dbg("e1000_id_led_init");
-
- if (hw->mac_type < e1000_82540) {
- /* Nothing to do */
- return E1000_SUCCESS;
- }
-
- ledctl = er32(LEDCTL);
- hw->ledctl_default = ledctl;
- hw->ledctl_mode1 = hw->ledctl_default;
- hw->ledctl_mode2 = hw->ledctl_default;
-
- if (e1000_read_eeprom(hw, EEPROM_ID_LED_SETTINGS, 1, &eeprom_data) < 0) {
- e_dbg("EEPROM Read Error\n");
- return -E1000_ERR_EEPROM;
- }
-
- if ((eeprom_data == ID_LED_RESERVED_0000) ||
- (eeprom_data == ID_LED_RESERVED_FFFF)) {
- eeprom_data = ID_LED_DEFAULT;
- }
-
- for (i = 0; i < 4; i++) {
- temp = (eeprom_data >> (i << 2)) & led_mask;
- switch (temp) {
- case ID_LED_ON1_DEF2:
- case ID_LED_ON1_ON2:
- case ID_LED_ON1_OFF2:
- hw->ledctl_mode1 &= ~(ledctl_mask << (i << 3));
- hw->ledctl_mode1 |= ledctl_on << (i << 3);
- break;
- case ID_LED_OFF1_DEF2:
- case ID_LED_OFF1_ON2:
- case ID_LED_OFF1_OFF2:
- hw->ledctl_mode1 &= ~(ledctl_mask << (i << 3));
- hw->ledctl_mode1 |= ledctl_off << (i << 3);
- break;
- default:
- /* Do nothing */
- break;
- }
- switch (temp) {
- case ID_LED_DEF1_ON2:
- case ID_LED_ON1_ON2:
- case ID_LED_OFF1_ON2:
- hw->ledctl_mode2 &= ~(ledctl_mask << (i << 3));
- hw->ledctl_mode2 |= ledctl_on << (i << 3);
- break;
- case ID_LED_DEF1_OFF2:
- case ID_LED_ON1_OFF2:
- case ID_LED_OFF1_OFF2:
- hw->ledctl_mode2 &= ~(ledctl_mask << (i << 3));
- hw->ledctl_mode2 |= ledctl_off << (i << 3);
- break;
- default:
- /* Do nothing */
- break;
- }
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_setup_led
- * @hw: Struct containing variables accessed by shared code
- *
- * Prepares SW controlable LED for use and saves the current state of the LED.
- */
-s32 e1000_setup_led(struct e1000_hw *hw)
-{
- u32 ledctl;
- s32 ret_val = E1000_SUCCESS;
-
- e_dbg("e1000_setup_led");
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- case e1000_82544:
- /* No setup necessary */
- break;
- case e1000_82541:
- case e1000_82547:
- case e1000_82541_rev_2:
- case e1000_82547_rev_2:
- /* Turn off PHY Smart Power Down (if enabled) */
- ret_val = e1000_read_phy_reg(hw, IGP01E1000_GMII_FIFO,
- &hw->phy_spd_default);
- if (ret_val)
- return ret_val;
- ret_val = e1000_write_phy_reg(hw, IGP01E1000_GMII_FIFO,
- (u16) (hw->phy_spd_default &
- ~IGP01E1000_GMII_SPD));
- if (ret_val)
- return ret_val;
- /* Fall Through */
- default:
- if (hw->media_type == e1000_media_type_fiber) {
- ledctl = er32(LEDCTL);
- /* Save current LEDCTL settings */
- hw->ledctl_default = ledctl;
- /* Turn off LED0 */
- ledctl &= ~(E1000_LEDCTL_LED0_IVRT |
- E1000_LEDCTL_LED0_BLINK |
- E1000_LEDCTL_LED0_MODE_MASK);
- ledctl |= (E1000_LEDCTL_MODE_LED_OFF <<
- E1000_LEDCTL_LED0_MODE_SHIFT);
- ew32(LEDCTL, ledctl);
- } else if (hw->media_type == e1000_media_type_copper)
- ew32(LEDCTL, hw->ledctl_mode1);
- break;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_cleanup_led - Restores the saved state of the SW controlable LED.
- * @hw: Struct containing variables accessed by shared code
- */
-s32 e1000_cleanup_led(struct e1000_hw *hw)
-{
- s32 ret_val = E1000_SUCCESS;
-
- e_dbg("e1000_cleanup_led");
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- case e1000_82544:
- /* No cleanup necessary */
- break;
- case e1000_82541:
- case e1000_82547:
- case e1000_82541_rev_2:
- case e1000_82547_rev_2:
- /* Turn on PHY Smart Power Down (if previously enabled) */
- ret_val = e1000_write_phy_reg(hw, IGP01E1000_GMII_FIFO,
- hw->phy_spd_default);
- if (ret_val)
- return ret_val;
- /* Fall Through */
- default:
- /* Restore LEDCTL settings */
- ew32(LEDCTL, hw->ledctl_default);
- break;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_led_on - Turns on the software controllable LED
- * @hw: Struct containing variables accessed by shared code
- */
-s32 e1000_led_on(struct e1000_hw *hw)
-{
- u32 ctrl = er32(CTRL);
-
- e_dbg("e1000_led_on");
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- /* Set SW Defineable Pin 0 to turn on the LED */
- ctrl |= E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- break;
- case e1000_82544:
- if (hw->media_type == e1000_media_type_fiber) {
- /* Set SW Defineable Pin 0 to turn on the LED */
- ctrl |= E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- } else {
- /* Clear SW Defineable Pin 0 to turn on the LED */
- ctrl &= ~E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- }
- break;
- default:
- if (hw->media_type == e1000_media_type_fiber) {
- /* Clear SW Defineable Pin 0 to turn on the LED */
- ctrl &= ~E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- } else if (hw->media_type == e1000_media_type_copper) {
- ew32(LEDCTL, hw->ledctl_mode2);
- return E1000_SUCCESS;
- }
- break;
- }
-
- ew32(CTRL, ctrl);
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_led_off - Turns off the software controllable LED
- * @hw: Struct containing variables accessed by shared code
- */
-s32 e1000_led_off(struct e1000_hw *hw)
-{
- u32 ctrl = er32(CTRL);
-
- e_dbg("e1000_led_off");
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- case e1000_82543:
- /* Clear SW Defineable Pin 0 to turn off the LED */
- ctrl &= ~E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- break;
- case e1000_82544:
- if (hw->media_type == e1000_media_type_fiber) {
- /* Clear SW Defineable Pin 0 to turn off the LED */
- ctrl &= ~E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- } else {
- /* Set SW Defineable Pin 0 to turn off the LED */
- ctrl |= E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- }
- break;
- default:
- if (hw->media_type == e1000_media_type_fiber) {
- /* Set SW Defineable Pin 0 to turn off the LED */
- ctrl |= E1000_CTRL_SWDPIN0;
- ctrl |= E1000_CTRL_SWDPIO0;
- } else if (hw->media_type == e1000_media_type_copper) {
- ew32(LEDCTL, hw->ledctl_mode1);
- return E1000_SUCCESS;
- }
- break;
- }
-
- ew32(CTRL, ctrl);
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_clear_hw_cntrs - Clears all hardware statistics counters.
- * @hw: Struct containing variables accessed by shared code
- */
-static void e1000_clear_hw_cntrs(struct e1000_hw *hw)
-{
- volatile u32 temp;
-
- temp = er32(CRCERRS);
- temp = er32(SYMERRS);
- temp = er32(MPC);
- temp = er32(SCC);
- temp = er32(ECOL);
- temp = er32(MCC);
- temp = er32(LATECOL);
- temp = er32(COLC);
- temp = er32(DC);
- temp = er32(SEC);
- temp = er32(RLEC);
- temp = er32(XONRXC);
- temp = er32(XONTXC);
- temp = er32(XOFFRXC);
- temp = er32(XOFFTXC);
- temp = er32(FCRUC);
-
- temp = er32(PRC64);
- temp = er32(PRC127);
- temp = er32(PRC255);
- temp = er32(PRC511);
- temp = er32(PRC1023);
- temp = er32(PRC1522);
-
- temp = er32(GPRC);
- temp = er32(BPRC);
- temp = er32(MPRC);
- temp = er32(GPTC);
- temp = er32(GORCL);
- temp = er32(GORCH);
- temp = er32(GOTCL);
- temp = er32(GOTCH);
- temp = er32(RNBC);
- temp = er32(RUC);
- temp = er32(RFC);
- temp = er32(ROC);
- temp = er32(RJC);
- temp = er32(TORL);
- temp = er32(TORH);
- temp = er32(TOTL);
- temp = er32(TOTH);
- temp = er32(TPR);
- temp = er32(TPT);
-
- temp = er32(PTC64);
- temp = er32(PTC127);
- temp = er32(PTC255);
- temp = er32(PTC511);
- temp = er32(PTC1023);
- temp = er32(PTC1522);
-
- temp = er32(MPTC);
- temp = er32(BPTC);
-
- if (hw->mac_type < e1000_82543)
- return;
-
- temp = er32(ALGNERRC);
- temp = er32(RXERRC);
- temp = er32(TNCRS);
- temp = er32(CEXTERR);
- temp = er32(TSCTC);
- temp = er32(TSCTFC);
-
- if (hw->mac_type <= e1000_82544)
- return;
-
- temp = er32(MGTPRC);
- temp = er32(MGTPDC);
- temp = er32(MGTPTC);
-}
-
-/**
- * e1000_reset_adaptive - Resets Adaptive IFS to its default state.
- * @hw: Struct containing variables accessed by shared code
- *
- * Call this after e1000_init_hw. You may override the IFS defaults by setting
- * hw->ifs_params_forced to true. However, you must initialize hw->
- * current_ifs_val, ifs_min_val, ifs_max_val, ifs_step_size, and ifs_ratio
- * before calling this function.
- */
-void e1000_reset_adaptive(struct e1000_hw *hw)
-{
- e_dbg("e1000_reset_adaptive");
-
- if (hw->adaptive_ifs) {
- if (!hw->ifs_params_forced) {
- hw->current_ifs_val = 0;
- hw->ifs_min_val = IFS_MIN;
- hw->ifs_max_val = IFS_MAX;
- hw->ifs_step_size = IFS_STEP;
- hw->ifs_ratio = IFS_RATIO;
- }
- hw->in_ifs_mode = false;
- ew32(AIT, 0);
- } else {
- e_dbg("Not in Adaptive IFS mode!\n");
- }
-}
-
-/**
- * e1000_update_adaptive - update adaptive IFS
- * @hw: Struct containing variables accessed by shared code
- * @tx_packets: Number of transmits since last callback
- * @total_collisions: Number of collisions since last callback
- *
- * Called during the callback/watchdog routine to update IFS value based on
- * the ratio of transmits to collisions.
- */
-void e1000_update_adaptive(struct e1000_hw *hw)
-{
- e_dbg("e1000_update_adaptive");
-
- if (hw->adaptive_ifs) {
- if ((hw->collision_delta *hw->ifs_ratio) > hw->tx_packet_delta) {
- if (hw->tx_packet_delta > MIN_NUM_XMITS) {
- hw->in_ifs_mode = true;
- if (hw->current_ifs_val < hw->ifs_max_val) {
- if (hw->current_ifs_val == 0)
- hw->current_ifs_val =
- hw->ifs_min_val;
- else
- hw->current_ifs_val +=
- hw->ifs_step_size;
- ew32(AIT, hw->current_ifs_val);
- }
- }
- } else {
- if (hw->in_ifs_mode
- && (hw->tx_packet_delta <= MIN_NUM_XMITS)) {
- hw->current_ifs_val = 0;
- hw->in_ifs_mode = false;
- ew32(AIT, 0);
- }
- }
- } else {
- e_dbg("Not in Adaptive IFS mode!\n");
- }
-}
-
-/**
- * e1000_tbi_adjust_stats
- * @hw: Struct containing variables accessed by shared code
- * @frame_len: The length of the frame in question
- * @mac_addr: The Ethernet destination address of the frame in question
- *
- * Adjusts the statistic counters when a frame is accepted by TBI_ACCEPT
- */
-void e1000_tbi_adjust_stats(struct e1000_hw *hw, struct e1000_hw_stats *stats,
- u32 frame_len, u8 *mac_addr)
-{
- u64 carry_bit;
-
- /* First adjust the frame length. */
- frame_len--;
- /* We need to adjust the statistics counters, since the hardware
- * counters overcount this packet as a CRC error and undercount
- * the packet as a good packet
- */
- /* This packet should not be counted as a CRC error. */
- stats->crcerrs--;
- /* This packet does count as a Good Packet Received. */
- stats->gprc++;
-
- /* Adjust the Good Octets received counters */
- carry_bit = 0x80000000 & stats->gorcl;
- stats->gorcl += frame_len;
- /* If the high bit of Gorcl (the low 32 bits of the Good Octets
- * Received Count) was one before the addition,
- * AND it is zero after, then we lost the carry out,
- * need to add one to Gorch (Good Octets Received Count High).
- * This could be simplified if all environments supported
- * 64-bit integers.
- */
- if (carry_bit && ((stats->gorcl & 0x80000000) == 0))
- stats->gorch++;
- /* Is this a broadcast or multicast? Check broadcast first,
- * since the test for a multicast frame will test positive on
- * a broadcast frame.
- */
- if ((mac_addr[0] == (u8) 0xff) && (mac_addr[1] == (u8) 0xff))
- /* Broadcast packet */
- stats->bprc++;
- else if (*mac_addr & 0x01)
- /* Multicast packet */
- stats->mprc++;
-
- if (frame_len == hw->max_frame_size) {
- /* In this case, the hardware has overcounted the number of
- * oversize frames.
- */
- if (stats->roc > 0)
- stats->roc--;
- }
-
- /* Adjust the bin counters when the extra byte put the frame in the
- * wrong bin. Remember that the frame_len was adjusted above.
- */
- if (frame_len == 64) {
- stats->prc64++;
- stats->prc127--;
- } else if (frame_len == 127) {
- stats->prc127++;
- stats->prc255--;
- } else if (frame_len == 255) {
- stats->prc255++;
- stats->prc511--;
- } else if (frame_len == 511) {
- stats->prc511++;
- stats->prc1023--;
- } else if (frame_len == 1023) {
- stats->prc1023++;
- stats->prc1522--;
- } else if (frame_len == 1522) {
- stats->prc1522++;
- }
-}
-
-/**
- * e1000_get_bus_info
- * @hw: Struct containing variables accessed by shared code
- *
- * Gets the current PCI bus type, speed, and width of the hardware
- */
-void e1000_get_bus_info(struct e1000_hw *hw)
-{
- u32 status;
-
- switch (hw->mac_type) {
- case e1000_82542_rev2_0:
- case e1000_82542_rev2_1:
- hw->bus_type = e1000_bus_type_pci;
- hw->bus_speed = e1000_bus_speed_unknown;
- hw->bus_width = e1000_bus_width_unknown;
- break;
- default:
- status = er32(STATUS);
- hw->bus_type = (status & E1000_STATUS_PCIX_MODE) ?
- e1000_bus_type_pcix : e1000_bus_type_pci;
-
- if (hw->device_id == E1000_DEV_ID_82546EB_QUAD_COPPER) {
- hw->bus_speed = (hw->bus_type == e1000_bus_type_pci) ?
- e1000_bus_speed_66 : e1000_bus_speed_120;
- } else if (hw->bus_type == e1000_bus_type_pci) {
- hw->bus_speed = (status & E1000_STATUS_PCI66) ?
- e1000_bus_speed_66 : e1000_bus_speed_33;
- } else {
- switch (status & E1000_STATUS_PCIX_SPEED) {
- case E1000_STATUS_PCIX_SPEED_66:
- hw->bus_speed = e1000_bus_speed_66;
- break;
- case E1000_STATUS_PCIX_SPEED_100:
- hw->bus_speed = e1000_bus_speed_100;
- break;
- case E1000_STATUS_PCIX_SPEED_133:
- hw->bus_speed = e1000_bus_speed_133;
- break;
- default:
- hw->bus_speed = e1000_bus_speed_reserved;
- break;
- }
- }
- hw->bus_width = (status & E1000_STATUS_BUS64) ?
- e1000_bus_width_64 : e1000_bus_width_32;
- break;
- }
-}
-
-/**
- * e1000_write_reg_io
- * @hw: Struct containing variables accessed by shared code
- * @offset: offset to write to
- * @value: value to write
- *
- * Writes a value to one of the devices registers using port I/O (as opposed to
- * memory mapped I/O). Only 82544 and newer devices support port I/O.
- */
-static void e1000_write_reg_io(struct e1000_hw *hw, u32 offset, u32 value)
-{
- unsigned long io_addr = hw->io_base;
- unsigned long io_data = hw->io_base + 4;
-
- e1000_io_write(hw, io_addr, offset);
- e1000_io_write(hw, io_data, value);
-}
-
-/**
- * e1000_get_cable_length - Estimates the cable length.
- * @hw: Struct containing variables accessed by shared code
- * @min_length: The estimated minimum length
- * @max_length: The estimated maximum length
- *
- * returns: - E1000_ERR_XXX
- * E1000_SUCCESS
- *
- * This function always returns a ranged length (minimum & maximum).
- * So for M88 phy's, this function interprets the one value returned from the
- * register to the minimum and maximum range.
- * For IGP phy's, the function calculates the range by the AGC registers.
- */
-static s32 e1000_get_cable_length(struct e1000_hw *hw, u16 *min_length,
- u16 *max_length)
-{
- s32 ret_val;
- u16 agc_value = 0;
- u16 i, phy_data;
- u16 cable_length;
-
- e_dbg("e1000_get_cable_length");
-
- *min_length = *max_length = 0;
-
- /* Use old method for Phy older than IGP */
- if (hw->phy_type == e1000_phy_m88) {
-
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
- cable_length = (phy_data & M88E1000_PSSR_CABLE_LENGTH) >>
- M88E1000_PSSR_CABLE_LENGTH_SHIFT;
-
- /* Convert the enum value to ranged values */
- switch (cable_length) {
- case e1000_cable_length_50:
- *min_length = 0;
- *max_length = e1000_igp_cable_length_50;
- break;
- case e1000_cable_length_50_80:
- *min_length = e1000_igp_cable_length_50;
- *max_length = e1000_igp_cable_length_80;
- break;
- case e1000_cable_length_80_110:
- *min_length = e1000_igp_cable_length_80;
- *max_length = e1000_igp_cable_length_110;
- break;
- case e1000_cable_length_110_140:
- *min_length = e1000_igp_cable_length_110;
- *max_length = e1000_igp_cable_length_140;
- break;
- case e1000_cable_length_140:
- *min_length = e1000_igp_cable_length_140;
- *max_length = e1000_igp_cable_length_170;
- break;
- default:
- return -E1000_ERR_PHY;
- break;
- }
- } else if (hw->phy_type == e1000_phy_igp) { /* For IGP PHY */
- u16 cur_agc_value;
- u16 min_agc_value = IGP01E1000_AGC_LENGTH_TABLE_SIZE;
- static const u16 agc_reg_array[IGP01E1000_PHY_CHANNEL_NUM] = {
- IGP01E1000_PHY_AGC_A,
- IGP01E1000_PHY_AGC_B,
- IGP01E1000_PHY_AGC_C,
- IGP01E1000_PHY_AGC_D
- };
- /* Read the AGC registers for all channels */
- for (i = 0; i < IGP01E1000_PHY_CHANNEL_NUM; i++) {
-
- ret_val =
- e1000_read_phy_reg(hw, agc_reg_array[i], &phy_data);
- if (ret_val)
- return ret_val;
-
- cur_agc_value = phy_data >> IGP01E1000_AGC_LENGTH_SHIFT;
-
- /* Value bound check. */
- if ((cur_agc_value >=
- IGP01E1000_AGC_LENGTH_TABLE_SIZE - 1)
- || (cur_agc_value == 0))
- return -E1000_ERR_PHY;
-
- agc_value += cur_agc_value;
-
- /* Update minimal AGC value. */
- if (min_agc_value > cur_agc_value)
- min_agc_value = cur_agc_value;
- }
-
- /* Remove the minimal AGC result for length < 50m */
- if (agc_value <
- IGP01E1000_PHY_CHANNEL_NUM * e1000_igp_cable_length_50) {
- agc_value -= min_agc_value;
-
- /* Get the average length of the remaining 3 channels */
- agc_value /= (IGP01E1000_PHY_CHANNEL_NUM - 1);
- } else {
- /* Get the average length of all the 4 channels. */
- agc_value /= IGP01E1000_PHY_CHANNEL_NUM;
- }
-
- /* Set the range of the calculated length. */
- *min_length = ((e1000_igp_cable_length_table[agc_value] -
- IGP01E1000_AGC_RANGE) > 0) ?
- (e1000_igp_cable_length_table[agc_value] -
- IGP01E1000_AGC_RANGE) : 0;
- *max_length = e1000_igp_cable_length_table[agc_value] +
- IGP01E1000_AGC_RANGE;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_check_polarity - Check the cable polarity
- * @hw: Struct containing variables accessed by shared code
- * @polarity: output parameter : 0 - Polarity is not reversed
- * 1 - Polarity is reversed.
- *
- * returns: - E1000_ERR_XXX
- * E1000_SUCCESS
- *
- * For phy's older than IGP, this function simply reads the polarity bit in the
- * Phy Status register. For IGP phy's, this bit is valid only if link speed is
- * 10 Mbps. If the link speed is 100 Mbps there is no polarity so this bit will
- * return 0. If the link speed is 1000 Mbps the polarity status is in the
- * IGP01E1000_PHY_PCS_INIT_REG.
- */
-static s32 e1000_check_polarity(struct e1000_hw *hw,
- e1000_rev_polarity *polarity)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_check_polarity");
-
- if (hw->phy_type == e1000_phy_m88) {
- /* return the Polarity bit in the Status register. */
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
- *polarity = ((phy_data & M88E1000_PSSR_REV_POLARITY) >>
- M88E1000_PSSR_REV_POLARITY_SHIFT) ?
- e1000_rev_polarity_reversed : e1000_rev_polarity_normal;
-
- } else if (hw->phy_type == e1000_phy_igp) {
- /* Read the Status register to check the speed */
- ret_val = e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- /* If speed is 1000 Mbps, must read the IGP01E1000_PHY_PCS_INIT_REG to
- * find the polarity status */
- if ((phy_data & IGP01E1000_PSSR_SPEED_MASK) ==
- IGP01E1000_PSSR_SPEED_1000MBPS) {
-
- /* Read the GIG initialization PCS register (0x00B4) */
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PCS_INIT_REG,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- /* Check the polarity bits */
- *polarity = (phy_data & IGP01E1000_PHY_POLARITY_MASK) ?
- e1000_rev_polarity_reversed :
- e1000_rev_polarity_normal;
- } else {
- /* For 10 Mbps, read the polarity bit in the status register. (for
- * 100 Mbps this bit is always 0) */
- *polarity =
- (phy_data & IGP01E1000_PSSR_POLARITY_REVERSED) ?
- e1000_rev_polarity_reversed :
- e1000_rev_polarity_normal;
- }
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_check_downshift - Check if Downshift occurred
- * @hw: Struct containing variables accessed by shared code
- * @downshift: output parameter : 0 - No Downshift occurred.
- * 1 - Downshift occurred.
- *
- * returns: - E1000_ERR_XXX
- * E1000_SUCCESS
- *
- * For phy's older than IGP, this function reads the Downshift bit in the Phy
- * Specific Status register. For IGP phy's, it reads the Downgrade bit in the
- * Link Health register. In IGP this bit is latched high, so the driver must
- * read it immediately after link is established.
- */
-static s32 e1000_check_downshift(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 phy_data;
-
- e_dbg("e1000_check_downshift");
-
- if (hw->phy_type == e1000_phy_igp) {
- ret_val = e1000_read_phy_reg(hw, IGP01E1000_PHY_LINK_HEALTH,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- hw->speed_downgraded =
- (phy_data & IGP01E1000_PLHR_SS_DOWNGRADE) ? 1 : 0;
- } else if (hw->phy_type == e1000_phy_m88) {
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_SPEC_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- hw->speed_downgraded = (phy_data & M88E1000_PSSR_DOWNSHIFT) >>
- M88E1000_PSSR_DOWNSHIFT_SHIFT;
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_config_dsp_after_link_change
- * @hw: Struct containing variables accessed by shared code
- * @link_up: was link up at the time this was called
- *
- * returns: - E1000_ERR_PHY if fail to read/write the PHY
- * E1000_SUCCESS at any other case.
- *
- * 82541_rev_2 & 82547_rev_2 have the capability to configure the DSP when a
- * gigabit link is achieved to improve link quality.
- */
-
-static s32 e1000_config_dsp_after_link_change(struct e1000_hw *hw, bool link_up)
-{
- s32 ret_val;
- u16 phy_data, phy_saved_data, speed, duplex, i;
- static const u16 dsp_reg_array[IGP01E1000_PHY_CHANNEL_NUM] = {
- IGP01E1000_PHY_AGC_PARAM_A,
- IGP01E1000_PHY_AGC_PARAM_B,
- IGP01E1000_PHY_AGC_PARAM_C,
- IGP01E1000_PHY_AGC_PARAM_D
- };
- u16 min_length, max_length;
-
- e_dbg("e1000_config_dsp_after_link_change");
-
- if (hw->phy_type != e1000_phy_igp)
- return E1000_SUCCESS;
-
- if (link_up) {
- ret_val = e1000_get_speed_and_duplex(hw, &speed, &duplex);
- if (ret_val) {
- e_dbg("Error getting link speed and duplex\n");
- return ret_val;
- }
-
- if (speed == SPEED_1000) {
-
- ret_val =
- e1000_get_cable_length(hw, &min_length,
- &max_length);
- if (ret_val)
- return ret_val;
-
- if ((hw->dsp_config_state == e1000_dsp_config_enabled)
- && min_length >= e1000_igp_cable_length_50) {
-
- for (i = 0; i < IGP01E1000_PHY_CHANNEL_NUM; i++) {
- ret_val =
- e1000_read_phy_reg(hw,
- dsp_reg_array[i],
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &=
- ~IGP01E1000_PHY_EDAC_MU_INDEX;
-
- ret_val =
- e1000_write_phy_reg(hw,
- dsp_reg_array
- [i], phy_data);
- if (ret_val)
- return ret_val;
- }
- hw->dsp_config_state =
- e1000_dsp_config_activated;
- }
-
- if ((hw->ffe_config_state == e1000_ffe_config_enabled)
- && (min_length < e1000_igp_cable_length_50)) {
-
- u16 ffe_idle_err_timeout =
- FFE_IDLE_ERR_COUNT_TIMEOUT_20;
- u32 idle_errs = 0;
-
- /* clear previous idle error counts */
- ret_val =
- e1000_read_phy_reg(hw, PHY_1000T_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- for (i = 0; i < ffe_idle_err_timeout; i++) {
- udelay(1000);
- ret_val =
- e1000_read_phy_reg(hw,
- PHY_1000T_STATUS,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- idle_errs +=
- (phy_data &
- SR_1000T_IDLE_ERROR_CNT);
- if (idle_errs >
- SR_1000T_PHY_EXCESSIVE_IDLE_ERR_COUNT)
- {
- hw->ffe_config_state =
- e1000_ffe_config_active;
-
- ret_val =
- e1000_write_phy_reg(hw,
- IGP01E1000_PHY_DSP_FFE,
- IGP01E1000_PHY_DSP_FFE_CM_CP);
- if (ret_val)
- return ret_val;
- break;
- }
-
- if (idle_errs)
- ffe_idle_err_timeout =
- FFE_IDLE_ERR_COUNT_TIMEOUT_100;
- }
- }
- }
- } else {
- if (hw->dsp_config_state == e1000_dsp_config_activated) {
- /* Save off the current value of register 0x2F5B to be restored at
- * the end of the routines. */
- ret_val =
- e1000_read_phy_reg(hw, 0x2F5B, &phy_saved_data);
-
- if (ret_val)
- return ret_val;
-
- /* Disable the PHY transmitter */
- ret_val = e1000_write_phy_reg(hw, 0x2F5B, 0x0003);
-
- if (ret_val)
- return ret_val;
-
- mdelay(20);
-
- ret_val = e1000_write_phy_reg(hw, 0x0000,
- IGP01E1000_IEEE_FORCE_GIGA);
- if (ret_val)
- return ret_val;
- for (i = 0; i < IGP01E1000_PHY_CHANNEL_NUM; i++) {
- ret_val =
- e1000_read_phy_reg(hw, dsp_reg_array[i],
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &= ~IGP01E1000_PHY_EDAC_MU_INDEX;
- phy_data |= IGP01E1000_PHY_EDAC_SIGN_EXT_9_BITS;
-
- ret_val =
- e1000_write_phy_reg(hw, dsp_reg_array[i],
- phy_data);
- if (ret_val)
- return ret_val;
- }
-
- ret_val = e1000_write_phy_reg(hw, 0x0000,
- IGP01E1000_IEEE_RESTART_AUTONEG);
- if (ret_val)
- return ret_val;
-
- mdelay(20);
-
- /* Now enable the transmitter */
- ret_val =
- e1000_write_phy_reg(hw, 0x2F5B, phy_saved_data);
-
- if (ret_val)
- return ret_val;
-
- hw->dsp_config_state = e1000_dsp_config_enabled;
- }
-
- if (hw->ffe_config_state == e1000_ffe_config_active) {
- /* Save off the current value of register 0x2F5B to be restored at
- * the end of the routines. */
- ret_val =
- e1000_read_phy_reg(hw, 0x2F5B, &phy_saved_data);
-
- if (ret_val)
- return ret_val;
-
- /* Disable the PHY transmitter */
- ret_val = e1000_write_phy_reg(hw, 0x2F5B, 0x0003);
-
- if (ret_val)
- return ret_val;
-
- mdelay(20);
-
- ret_val = e1000_write_phy_reg(hw, 0x0000,
- IGP01E1000_IEEE_FORCE_GIGA);
- if (ret_val)
- return ret_val;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_DSP_FFE,
- IGP01E1000_PHY_DSP_FFE_DEFAULT);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_write_phy_reg(hw, 0x0000,
- IGP01E1000_IEEE_RESTART_AUTONEG);
- if (ret_val)
- return ret_val;
-
- mdelay(20);
-
- /* Now enable the transmitter */
- ret_val =
- e1000_write_phy_reg(hw, 0x2F5B, phy_saved_data);
-
- if (ret_val)
- return ret_val;
-
- hw->ffe_config_state = e1000_ffe_config_enabled;
- }
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_set_phy_mode - Set PHY to class A mode
- * @hw: Struct containing variables accessed by shared code
- *
- * Assumes the following operations will follow to enable the new class mode.
- * 1. Do a PHY soft reset
- * 2. Restart auto-negotiation or force link.
- */
-static s32 e1000_set_phy_mode(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 eeprom_data;
-
- e_dbg("e1000_set_phy_mode");
-
- if ((hw->mac_type == e1000_82545_rev_3) &&
- (hw->media_type == e1000_media_type_copper)) {
- ret_val =
- e1000_read_eeprom(hw, EEPROM_PHY_CLASS_WORD, 1,
- &eeprom_data);
- if (ret_val) {
- return ret_val;
- }
-
- if ((eeprom_data != EEPROM_RESERVED_WORD) &&
- (eeprom_data & EEPROM_PHY_CLASS_A)) {
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT,
- 0x000B);
- if (ret_val)
- return ret_val;
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL,
- 0x8104);
- if (ret_val)
- return ret_val;
-
- hw->phy_reset_disable = false;
- }
- }
-
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_set_d3_lplu_state - set d3 link power state
- * @hw: Struct containing variables accessed by shared code
- * @active: true to enable lplu false to disable lplu.
- *
- * This function sets the lplu state according to the active flag. When
- * activating lplu this function also disables smart speed and vise versa.
- * lplu will not be activated unless the device autonegotiation advertisement
- * meets standards of either 10 or 10/100 or 10/100/1000 at all duplexes.
- *
- * returns: - E1000_ERR_PHY if fail to read/write the PHY
- * E1000_SUCCESS at any other case.
- */
-static s32 e1000_set_d3_lplu_state(struct e1000_hw *hw, bool active)
-{
- s32 ret_val;
- u16 phy_data;
- e_dbg("e1000_set_d3_lplu_state");
-
- if (hw->phy_type != e1000_phy_igp)
- return E1000_SUCCESS;
-
- /* During driver activity LPLU should not be used or it will attain link
- * from the lowest speeds starting from 10Mbps. The capability is used for
- * Dx transitions and states */
- if (hw->mac_type == e1000_82541_rev_2
- || hw->mac_type == e1000_82547_rev_2) {
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_GMII_FIFO, &phy_data);
- if (ret_val)
- return ret_val;
- }
-
- if (!active) {
- if (hw->mac_type == e1000_82541_rev_2 ||
- hw->mac_type == e1000_82547_rev_2) {
- phy_data &= ~IGP01E1000_GMII_FLEX_SPD;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_GMII_FIFO,
- phy_data);
- if (ret_val)
- return ret_val;
- }
-
- /* LPLU and SmartSpeed are mutually exclusive. LPLU is used during
- * Dx states where the power conservation is most important. During
- * driver activity we should enable SmartSpeed, so performance is
- * maintained. */
- if (hw->smart_speed == e1000_smart_speed_on) {
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= IGP01E1000_PSCFR_SMART_SPEED;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- phy_data);
- if (ret_val)
- return ret_val;
- } else if (hw->smart_speed == e1000_smart_speed_off) {
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &= ~IGP01E1000_PSCFR_SMART_SPEED;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- phy_data);
- if (ret_val)
- return ret_val;
- }
- } else if ((hw->autoneg_advertised == AUTONEG_ADVERTISE_SPEED_DEFAULT)
- || (hw->autoneg_advertised == AUTONEG_ADVERTISE_10_ALL)
- || (hw->autoneg_advertised ==
- AUTONEG_ADVERTISE_10_100_ALL)) {
-
- if (hw->mac_type == e1000_82541_rev_2 ||
- hw->mac_type == e1000_82547_rev_2) {
- phy_data |= IGP01E1000_GMII_FLEX_SPD;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_GMII_FIFO,
- phy_data);
- if (ret_val)
- return ret_val;
- }
-
- /* When LPLU is enabled we should disable SmartSpeed */
- ret_val =
- e1000_read_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &= ~IGP01E1000_PSCFR_SMART_SPEED;
- ret_val =
- e1000_write_phy_reg(hw, IGP01E1000_PHY_PORT_CONFIG,
- phy_data);
- if (ret_val)
- return ret_val;
-
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_set_vco_speed
- * @hw: Struct containing variables accessed by shared code
- *
- * Change VCO speed register to improve Bit Error Rate performance of SERDES.
- */
-static s32 e1000_set_vco_speed(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 default_page = 0;
- u16 phy_data;
-
- e_dbg("e1000_set_vco_speed");
-
- switch (hw->mac_type) {
- case e1000_82545_rev_3:
- case e1000_82546_rev_3:
- break;
- default:
- return E1000_SUCCESS;
- }
-
- /* Set PHY register 30, page 5, bit 8 to 0 */
-
- ret_val =
- e1000_read_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, &default_page);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0005);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data &= ~M88E1000_PHY_VCO_REG_BIT8;
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, phy_data);
- if (ret_val)
- return ret_val;
-
- /* Set PHY register 30, page 4, bit 11 to 1 */
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0004);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_read_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, &phy_data);
- if (ret_val)
- return ret_val;
-
- phy_data |= M88E1000_PHY_VCO_REG_BIT11;
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, phy_data);
- if (ret_val)
- return ret_val;
-
- ret_val =
- e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, default_page);
- if (ret_val)
- return ret_val;
-
- return E1000_SUCCESS;
-}
-
-
-/**
- * e1000_enable_mng_pass_thru - check for bmc pass through
- * @hw: Struct containing variables accessed by shared code
- *
- * Verifies the hardware needs to allow ARPs to be processed by the host
- * returns: - true/false
- */
-u32 e1000_enable_mng_pass_thru(struct e1000_hw *hw)
-{
- u32 manc;
-
- if (hw->asf_firmware_present) {
- manc = er32(MANC);
-
- if (!(manc & E1000_MANC_RCV_TCO_EN) ||
- !(manc & E1000_MANC_EN_MAC_ADDR_FILTER))
- return false;
- if ((manc & E1000_MANC_SMBUS_EN) && !(manc & E1000_MANC_ASF_EN))
- return true;
- }
- return false;
-}
-
-static s32 e1000_polarity_reversal_workaround(struct e1000_hw *hw)
-{
- s32 ret_val;
- u16 mii_status_reg;
- u16 i;
-
- /* Polarity reversal workaround for forced 10F/10H links. */
-
- /* Disable the transmitter on the PHY */
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0019);
- if (ret_val)
- return ret_val;
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0xFFFF);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0000);
- if (ret_val)
- return ret_val;
-
- /* This loop will early-out if the NO link condition has been met. */
- for (i = PHY_FORCE_TIME; i > 0; i--) {
- /* Read the MII Status Register and wait for Link Status bit
- * to be clear.
- */
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- if ((mii_status_reg & ~MII_SR_LINK_STATUS) == 0)
- break;
- mdelay(100);
- }
-
- /* Recommended delay time after link has been lost */
- mdelay(1000);
-
- /* Now we will re-enable th transmitter on the PHY */
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0019);
- if (ret_val)
- return ret_val;
- mdelay(50);
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0xFFF0);
- if (ret_val)
- return ret_val;
- mdelay(50);
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0xFF00);
- if (ret_val)
- return ret_val;
- mdelay(50);
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_GEN_CONTROL, 0x0000);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_write_phy_reg(hw, M88E1000_PHY_PAGE_SELECT, 0x0000);
- if (ret_val)
- return ret_val;
-
- /* This loop will early-out if the link condition has been met. */
- for (i = PHY_FORCE_TIME; i > 0; i--) {
- /* Read the MII Status Register and wait for Link Status bit
- * to be set.
- */
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- ret_val = e1000_read_phy_reg(hw, PHY_STATUS, &mii_status_reg);
- if (ret_val)
- return ret_val;
-
- if (mii_status_reg & MII_SR_LINK_STATUS)
- break;
- mdelay(100);
- }
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_get_auto_rd_done
- * @hw: Struct containing variables accessed by shared code
- *
- * Check for EEPROM Auto Read bit done.
- * returns: - E1000_ERR_RESET if fail to reset MAC
- * E1000_SUCCESS at any other case.
- */
-static s32 e1000_get_auto_rd_done(struct e1000_hw *hw)
-{
- e_dbg("e1000_get_auto_rd_done");
- msleep(5);
- return E1000_SUCCESS;
-}
-
-/**
- * e1000_get_phy_cfg_done
- * @hw: Struct containing variables accessed by shared code
- *
- * Checks if the PHY configuration is done
- * returns: - E1000_ERR_RESET if fail to reset MAC
- * E1000_SUCCESS at any other case.
- */
-static s32 e1000_get_phy_cfg_done(struct e1000_hw *hw)
-{
- e_dbg("e1000_get_phy_cfg_done");
- mdelay(10);
- return E1000_SUCCESS;
-}
*rg, *trg;
long chg = 0;
+ spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
list_for_each_entry(rg, head, link)
if (end <= rg->to)
break;
if (&rg->link == head)
- return 0;
+ goto out;
/* If we are in the middle of a region then adjust it. */
if (end > rg->from) {
@@ -263,14 +287,19 @@ static long region_truncate(struct list_head *head, long end)
list_del(&rg->link);
kfree(rg);
}
+
+out:
+ spin_unlock(&resv->lock);
return chg;
}
-static long region_count(struct list_head *head, long f, long t)
+static long region_count(struct resv_map *resv, long f, long t)
{
+ struct list_head *head = &resv->regions;
struct file_region *rg;
long chg = 0;
+ spin_lock(&resv->lock);
/* Locate each segment we overlap with, and count that overlap. */
list_for_each_entry(rg, head, link) {
long seg_from;
@@ -286,6 +315,7 @@ static long region_count(struct list_head *head, long f, long t)
chg += seg_to - seg_from;
}
+ spin_unlock(&resv->lock);
return chg;
}
@@ -376,39 +406,46 @@ static void set_vma_private_data(struct vm_area_struct *vma,
vma->vm_private_data = (void *)value;
}
-struct resv_map {
- struct kref refs;
- struct list_head regions;
-};
-
-static struct resv_map *resv_map_alloc(void)
+struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
if (!resv_map)
return NULL;
kref_init(&resv_map->refs);
+ spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
return resv_map;
}
-static void resv_map_release(struct kref *ref)
+void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
/* Clear out any active regions before we release the map. */
- region_truncate(&resv_map->regions, 0);
+ region_truncate(resv_map, 0);
kfree(resv_map);
}
+static inline struct resv_map *inode_resv_map(struct inode *inode)
+{
+ return inode->i_mapping->private_data;
+}
+
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
VM_BUG_ON(!is_vm_hugetlb_page(vma));
- if (!(vma->vm_flags & VM_MAYSHARE))
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ struct inode *inode = mapping->host;
+
+ return inode_resv_map(inode);
+
+ } else {
return (struct resv_map *)(get_vma_private_data(vma) &
~HPAGE_RESV_MASK);
- return NULL;
+ }
}
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -507,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepages_treat_as_movable || hugepage_migration_support(h))
+ if (hugepages_treat_as_movable || hugepage_migration_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -540,7 +577,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
goto err;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = huge_zonelist(vma, address,
htlb_alloc_mask(h), &mpol, &nodemask);
@@ -562,7 +599,7 @@ retry_cpuset:
}
mpol_cond_put(mpol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -570,25 +607,242 @@ err:
return NULL;
}
+/*
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed. Ensure that we use an allowed
+ * node for alloc or free.
+ */
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ nid = next_node(nid, *nodes_allowed);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(*nodes_allowed);
+ VM_BUG_ON(nid >= MAX_NUMNODES);
+
+ return nid;
+}
+
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+ if (!node_isset(nid, *nodes_allowed))
+ nid = next_node_allowed(nid, nodes_allowed);
+ return nid;
+}
+
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+ h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+/*
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page. Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
+ */
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
+{
+ int nid;
+
+ VM_BUG_ON(!nodes_allowed);
+
+ nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+ h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+
+ return nid;
+}
+
+#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
+ nr_nodes--)
+
+#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
+ for (nr_nodes = nodes_weight(*mask); \
+ nr_nodes > 0 && \
+ ((node = hstate_next_node_to_free(hs, mask)) || 1); \
+ nr_nodes--)
+
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order)
+{
+ int i;
+ int nr_pages = 1 << order;
+ struct page *p = page + 1;
+
+ for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ __ClearPageTail(p);
+ set_page_refcounted(p);
+ p->first_page = NULL;
+ }
+
+ set_compound_order(page, 0);
+ __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+ free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long end_pfn = start_pfn + nr_pages;
+ return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+ unsigned long nr_pages)
+{
+ unsigned long i, end_pfn = start_pfn + nr_pages;
+ struct page *page;
+
+ for (i = start_pfn; i < end_pfn; i++) {
+ if (!pfn_valid(i))
+ return false;
+
+ page = pfn_to_page(i);
+
+ if (PageReserved(page))
+ return false;
+
+ if (page_count(page) > 0)
+ return false;
+
+ if (PageHuge(page))
+ return false;
+ }
+
+ return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long last_pfn = start_pfn + nr_pages - 1;
+ return zone_spans_pfn(zone, last_pfn);
+}
+
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+ unsigned long nr_pages = 1 << order;
+ unsigned long ret, pfn, flags;
+ struct zone *z;
+
+ z = NODE_DATA(nid)->node_zones;
+ for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+ spin_lock_irqsave(&z->lock, flags);
+
+ pfn = ALIGN(z->zone_start_pfn, nr_pages);
+ while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+ if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+ /*
+ * We release the zone lock here because
+ * alloc_contig_range() will also lock the zone
+ * at some point. If there's an allocation
+ * spinning on this lock, it may win the race
+ * and cause alloc_contig_range() to fail...
+ */
+ spin_unlock_irqrestore(&z->lock, flags);
+ ret = __alloc_gigantic_page(pfn, nr_pages);
+ if (!ret)
+ return pfn_to_page(pfn);
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ pfn += nr_pages;
+ }
+
+ spin_unlock_irqrestore(&z->lock, flags);
+ }
+
+ return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+ struct page *page;
+
+ page = alloc_gigantic_page(nid, huge_page_order(h));
+ if (page) {
+ prep_compound_gigantic_page(page, huge_page_order(h));
+ prep_new_huge_page(h, page, nid);
+ }
+
+ return page;
+}
+
+static int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed)
+{
+ struct page *page = NULL;
+ int nr_nodes, node;
+
+ for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+ page = alloc_fresh_gigantic_page_node(h, node);
+ if (page)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+ unsigned long order) { }
+static inline int alloc_fresh_gigantic_page(struct hstate *h,
+ nodemask_t *nodes_allowed) { return 0; }
+#endif
+
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- VM_BUG_ON(h->order >= MAX_ORDER);
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
- 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1 << PG_writeback);
+ 1 << PG_active | 1 << PG_private |
+ 1 << PG_writeback);
}
- VM_BUG_ON(hugetlb_cgroup_from_page(page));
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
- arch_release_hugepage(page);
- __free_pages(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ destroy_compound_gigantic_page(page, huge_page_order(h));
+ free_gigantic_page(page, huge_page_order(h));
+ } else {
+ arch_release_hugepage(page);
+ __free_pages(page, huge_page_order(h));
+ }
}
struct hstate *size_to_hstate(unsigned long size)
@@ -602,7 +856,7 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-static void free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -627,7 +881,7 @@ static void free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
list_del(&page->lru);
update_and_free_page(h, page);
@@ -690,15 +944,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
*/
int PageHuge(struct page *page)
{
- compound_page_dtor *dtor;
-
if (!PageCompound(page))
return 0;
page = compound_head(page);
- dtor = get_compound_page_dtor(page);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page) == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -708,16 +958,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
*/
int PageHeadHuge(struct page *page_head)
{
- compound_page_dtor *dtor;
-
if (!PageHead(page_head))
return 0;
- dtor = get_compound_page_dtor(page_head);
-
- return dtor == free_huge_page;
+ return get_compound_page_dtor(page_head) == free_huge_page;
}
-EXPORT_SYMBOL_GPL(PageHeadHuge);
pgoff_t __basepage_index(struct page *page)
{
@@ -740,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
- if (h->order >= MAX_ORDER)
- return NULL;
-
page = alloc_pages_exact_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
@@ -758,79 +1000,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
return page;
}
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed. Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- nid = next_node(nid, *nodes_allowed);
- if (nid == MAX_NUMNODES)
- nid = first_node(*nodes_allowed);
- VM_BUG_ON(nid >= MAX_NUMNODES);
-
- return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
- if (!node_isset(nid, *nodes_allowed))
- nid = next_node_allowed(nid, nodes_allowed);
- return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(struct hstate *h,
- nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
- h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-/*
- * helper for free_pool_huge_page() - return the previously saved
- * node ["this node"] from which to free a huge page. Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
- int nid;
-
- VM_BUG_ON(!nodes_allowed);
-
- nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
- h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
- return nid;
-}
-
-#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
- nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
- for (nr_nodes = nodes_weight(*mask); \
- nr_nodes > 0 && \
- ((node = hstate_next_node_to_free(hs, mask)) || 1); \
- nr_nodes--)
-
static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
struct page *page;
@@ -934,7 +1103,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
struct page *page;
unsigned int r_nid;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return NULL;
/*
@@ -1098,7 +1267,7 @@ retry:
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
- VM_BUG_ON(page_count(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
enqueue_huge_page(h, page);
}
free:
@@ -1127,7 +1296,7 @@ static void return_unused_surplus_pages(struct hstate *h,
h->resv_huge_pages -= unused_resv_pages;
/* Cannot return gigantic pages currently */
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
@@ -1143,6 +1312,7 @@ static void return_unused_surplus_pages(struct hstate *h,
while (nr_pages--) {
if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
+ cond_resched_lock(&hugetlb_lock);
}
}
@@ -1159,45 +1329,34 @@ static void return_unused_surplus_pages(struct hstate *h,
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- return region_chg(&inode->i_mapping->private_list,
- idx, idx + 1);
+ struct resv_map *resv;
+ pgoff_t idx;
+ long chg;
- } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv = vma_resv_map(vma);
+ if (!resv)
return 1;
- } else {
- long err;
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = vma_resv_map(vma);
+ idx = vma_hugecache_offset(h, vma, addr);
+ chg = region_chg(resv, idx, idx + 1);
- err = region_chg(&resv->regions, idx, idx + 1);
- if (err < 0)
- return err;
- return 0;
- }
+ if (vma->vm_flags & VM_MAYSHARE)
+ return chg;
+ else
+ return chg < 0 ? chg : 0;
}
static void vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct address_space *mapping = vma->vm_file->f_mapping;
- struct inode *inode = mapping->host;
-
- if (vma->vm_flags & VM_MAYSHARE) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- region_add(&inode->i_mapping->private_list, idx, idx + 1);
+ struct resv_map *resv;
+ pgoff_t idx;
- } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
- pgoff_t idx = vma_hugecache_offset(h, vma, addr);
- struct resv_map *resv = vma_resv_map(vma);
+ resv = vma_resv_map(vma);
+ if (!resv)
+ return;
- /* Mark this page used in the map. */
- region_add(&resv->regions, idx, idx + 1);
- }
+ idx = vma_hugecache_offset(h, vma, addr);
+ region_add(resv, idx, idx + 1);
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1227,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
- if (ret) {
- if (chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
- return ERR_PTR(-ENOSPC);
- }
+ if (ret)
+ goto out_subpool_put;
+
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
- if (!page) {
- hugetlb_cgroup_uncharge_cgroup(idx,
- pages_per_huge_page(h),
- h_cg);
- if (chg || avoid_reserve)
- hugepage_subpool_put_pages(spool, 1);
- return ERR_PTR(-ENOSPC);
- }
+ if (!page)
+ goto out_uncharge_cgroup;
+
spin_lock(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
@@ -1256,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
vma_commit_reservation(h, vma, addr);
return page;
+
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_subpool_put:
+ if (chg || avoid_reserve)
+ hugepage_subpool_put_pages(spool, 1);
+ return ERR_PTR(-ENOSPC);
}
/*
@@ -1280,9 +1439,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- huge_page_size(h), huge_page_size(h), 0);
-
+ addr = memblock_virt_alloc_try_nid_nopanic(
+ huge_page_size(h), huge_page_size(h),
+ 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1303,7 +1462,7 @@ found:
return 1;
}
-static void prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page, int order)
{
if (unlikely(order > (MAX_ORDER - 1)))
prep_compound_gigantic_page(page, order);
@@ -1322,8 +1481,8 @@ static void __init gather_bootmem_prealloc(void)
#ifdef CONFIG_HIGHMEM
page = pfn_to_page(m->phys >> PAGE_SHIFT);
- free_bootmem_late((unsigned long)m,
- sizeof(struct huge_bootmem_page));
+ memblock_free_late(__pa(m),
+ sizeof(struct huge_bootmem_page));
#else
page = virt_to_page(m);
#endif
@@ -1337,7 +1496,7 @@ static void __init gather_bootmem_prealloc(void)
* fix confusing memory reports from free(1) and another
* side-effects, like CommitLimit going negative.
*/
- if (h->order > (MAX_ORDER - 1))
+ if (hstate_is_gigantic(h))
adjust_managed_page_count(page, 1 << h->order);
}
}
@@ -1347,7 +1506,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
unsigned long i;
for (i = 0; i < h->max_huge_pages; ++i) {
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
@@ -1363,7 +1522,7 @@ static void __init hugetlb_init_hstates(void)
for_each_hstate(h) {
/* oversize hugepages were init'ed in early boot */
- if (h->order < MAX_ORDER)
+ if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
}
@@ -1397,7 +1556,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
{
int i;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return;
for_each_node_mask(i, *nodes_allowed) {
@@ -1460,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
{
unsigned long min_count, ret;
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h) && !gigantic_page_supported())
return h->max_huge_pages;
/*
@@ -1487,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* and reducing the surplus.
*/
spin_unlock(&hugetlb_lock);
- ret = alloc_fresh_huge_page(h, nodes_allowed);
+ if (hstate_is_gigantic(h))
+ ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+ else
+ ret = alloc_fresh_huge_page(h, nodes_allowed);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -1518,6 +1680,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
+ cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -1586,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
goto out;
h = kobj_to_hstate(kobj, &nid);
- if (h->order >= MAX_ORDER) {
+ if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
err = -EINVAL;
goto out;
}
@@ -1669,7 +1832,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
- if (h->order >= MAX_ORDER)
+ if (hstate_is_gigantic(h))
return -EINVAL;
err = kstrtoul(buf, 10, &input);
@@ -1953,16 +2116,15 @@ static void __exit hugetlb_exit(void)
}
kobject_put(hugepages_kobj);
+ kfree(htlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
static int __init hugetlb_init(void)
{
- /* Some platform decide whether they support huge pages at boot
- * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
- * there is no such support
- */
- if (HPAGE_SHIFT == 0)
+ int i;
+
+ if (!hugepages_supported())
return 0;
if (!size_to_hstate(default_hstate_size)) {
@@ -1982,6 +2144,17 @@ static int __init hugetlb_init(void)
hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
+#ifdef CONFIG_SMP
+ num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
+#else
+ num_fault_mutexes = 1;
+#endif
+ htlb_fault_mutex_table =
+ kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
+ BUG_ON(!htlb_fault_mutex_table);
+
+ for (i = 0; i < num_fault_mutexes; i++)
+ mutex_init(&htlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -2078,9 +2251,12 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->max_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
return -EINVAL;
table->data = &tmp;
@@ -2131,9 +2307,12 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
unsigned long tmp;
int ret;
+ if (!hugepages_supported())
+ return -ENOTSUPP;
+
tmp = h->nr_overcommit_huge_pages;
- if (write && h->order >= MAX_ORDER)
+ if (write && hstate_is_gigantic(h))
return -EINVAL;
table->data = &tmp;
@@ -2156,6 +2335,8 @@ out:
void hugetlb_report_meminfo(struct seq_file *m)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return;
seq_printf(m,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
@@ -2172,6 +2353,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
int hugetlb_report_node_meminfo(int nid, char *buf)
{
struct hstate *h = &default_hstate;
+ if (!hugepages_supported())
+ return 0;
return sprintf(buf,
"Node %d HugePages_Total: %5u\n"
"Node %d HugePages_Free: %5u\n"
@@ -2186,6 +2369,9 @@ void hugetlb_show_meminfo(void)
struct hstate *h;
int nid;
+ if (!hugepages_supported())
+ return;
+
for_each_node_state(nid, N_MEMORY)
for_each_hstate(h)
pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
@@ -2260,41 +2446,30 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv)
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_get(&resv->refs);
}
-static void resv_map_put(struct vm_area_struct *vma)
-{
- struct resv_map *resv = vma_resv_map(vma);
-
- if (!resv)
- return;
- kref_put(&resv->refs, resv_map_release);
-}
-
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
- unsigned long reserve;
- unsigned long start;
- unsigned long end;
+ unsigned long reserve, start, end;
- if (resv) {
- start = vma_hugecache_offset(h, vma, vma->vm_start);
- end = vma_hugecache_offset(h, vma, vma->vm_end);
+ if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ return;
- reserve = (end - start) -
- region_count(&resv->regions, start, end);
+ start = vma_hugecache_offset(h, vma, vma->vm_start);
+ end = vma_hugecache_offset(h, vma, vma->vm_end);
- resv_map_put(vma);
+ reserve = (end - start) - region_count(resv, start, end);
- if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
- }
+ kref_put(&resv->refs, resv_map_release);
+
+ if (reserve) {
+ hugetlb_acct_memory(h, -reserve);
+ hugepage_subpool_put_pages(spool, reserve);
}
}
@@ -2345,6 +2520,31 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
update_mmu_cache(vma, address, ptep);
}
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_migration_entry(swp))
+ return 1;
+ else
+ return 0;
+}
+
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
+ return 1;
+ else
+ return 0;
+}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
@@ -2355,17 +2555,27 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+ mmun_start = vma->vm_start;
+ mmun_end = vma->vm_end;
+ if (cow)
+ mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
dst_pte = huge_pte_alloc(dst, addr, sz);
- if (!dst_pte)
- goto nomem;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ break;
+ }
/* If the pagetables are shared don't copy or take references */
if (dst_pte == src_pte)
@@ -2374,7 +2584,24 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- if (!huge_pte_none(huge_ptep_get(src_pte))) {
+ entry = huge_ptep_get(src_pte);
+ if (huge_pte_none(entry)) { /* skip none entry */
+ ;
+ } else if (unlikely(is_hugetlb_entry_migration(entry) ||
+ is_hugetlb_entry_hwpoisoned(entry))) {
+ swp_entry_t swp_entry = pte_to_swp_entry(entry);
+
+ if (is_write_migration_entry(swp_entry) && cow) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&swp_entry);
+ entry = swp_entry_to_pte(swp_entry);
+ set_huge_pte_at(src, addr, src_pte, entry);
+ }
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_ptep_get(src_pte);
@@ -2386,36 +2613,11 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}
- return 0;
-nomem:
- return -ENOMEM;
-}
+ if (cow)
+ mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
-static int is_hugetlb_entry_migration(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
- return 1;
- else
- return 0;
-}
-
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
- swp_entry_t swp;
-
- if (huge_pte_none(pte) || pte_present(pte))
- return 0;
- swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
- else
- return 0;
+ return ret;
}
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2659,7 +2861,8 @@ retry_avoidcopy:
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
@@ -2703,7 +2906,7 @@ retry_avoidcopy:
*/
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
@@ -2759,15 +2962,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
}
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
int ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
- pgoff_t idx;
unsigned long size;
struct page *page;
- struct address_space *mapping;
pte_t new_pte;
spinlock_t *ptl;
@@ -2782,9 +2984,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
/*
* Use page lock to guard against racing truncation
* before we get page_table_lock.
@@ -2869,8 +3068,7 @@ retry:
if (anon_rmap) {
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
- }
- else
+ } else
page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -2894,17 +3092,53 @@ backout_unlocked:
goto out;
}
+#ifdef CONFIG_SMP
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ unsigned long key[2];
+ u32 hash;
+
+ if (vma->vm_flags & VM_SHARED) {
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
+ } else {
+ key[0] = (unsigned long) mm;
+ key[1] = address >> huge_page_shift(h);
+ }
+
+ hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+
+ return hash & (num_fault_mutexes - 1);
+}
+#else
+/*
+ * For uniprocesor systems we always use a single mutex, so just
+ * return 0 and avoid the hashing overhead.
+ */
+static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address)
+{
+ return 0;
+}
+#endif
+
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
- pte_t *ptep;
- pte_t entry;
+ pte_t *ptep, entry;
spinlock_t *ptl;
int ret;
+ u32 hash;
+ pgoff_t idx;
struct page *page = NULL;
struct page *pagecache_page = NULL;
- static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping;
address &= huge_page_mask(h);
@@ -2923,15 +3157,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!ptep)
return VM_FAULT_OOM;
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- mutex_lock(&hugetlb_instantiation_mutex);
+ hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&htlb_fault_mutex_table[hash]);
+
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, address, ptep, flags);
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
goto out_mutex;
}
@@ -3000,8 +3239,7 @@ out_ptl:
put_page(page);
out_mutex:
- mutex_unlock(&hugetlb_instantiation_mutex);
-
+ mutex_unlock(&htlb_fault_mutex_table[hash]);
return ret;
}
@@ -3079,7 +3317,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ get_page_foll(pages[i]);
}
if (vmas)
@@ -3118,6 +3356,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -3147,6 +3386,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
}
@@ -3159,6 +3399,7 @@ int hugetlb_reserve_pages(struct inode *inode,
long ret, chg;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
+ struct resv_map *resv_map;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3174,10 +3415,13 @@ int hugetlb_reserve_pages(struct inode *inode,
* to reserve the full area even if read-only as mprotect() may be
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE)
- chg = region_chg(&inode->i_mapping->private_list, from, to);
- else {
- struct resv_map *resv_map = resv_map_alloc();
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ resv_map = inode_resv_map(inode);
+
+ chg = region_chg(resv_map, from, to);
+
+ } else {
+ resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
@@ -3220,20 +3464,23 @@ int hugetlb_reserve_pages(struct inode *inode,
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_add(&inode->i_mapping->private_list, from, to);
+ region_add(resv_map, from, to);
return 0;
out_err:
- if (vma)
- resv_map_put(vma);
+ if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&resv_map->refs, resv_map_release);
return ret;
}
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
struct hstate *h = hstate_inode(inode);
- long chg = region_truncate(&inode->i_mapping->private_list, offset);
+ struct resv_map *resv_map = inode_resv_map(inode);
+ long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
+ if (resv_map)
+ chg = region_truncate(resv_map, offset);
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3444,7 +3691,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
+struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write)
{
@@ -3501,7 +3748,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
if (!get_page_unless_zero(page))
return false;
spin_lock(&hugetlb_lock);
@@ -3512,7 +3759,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
void putback_active_hugepage(struct page *page)
{
- VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
@@ -3521,7 +3768,7 @@ void putback_active_hugepage(struct page *page)
bool is_hugepage_active(struct page *page)
{
- VM_BUG_ON(!PageHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
/*
* This function can be called for a tail page because the caller,
* scan_movable_pages, scans through a given pfn-range which typically
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index bda8e44f6fd..493f758445e 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,7 +30,6 @@ struct hugetlb_cgroup {
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-struct cgroup_subsys hugetlb_subsys __read_mostly;
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline
@@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
- return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id));
+ return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -53,7 +52,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
- return hugetlb_cgroup_from_css(css_parent(&h_cg->css));
+ return hugetlb_cgroup_from_css(h_cg->css.parent);
}
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
@@ -182,7 +181,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget(&h_cg->css)) {
+ if (!css_tryget_online(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
@@ -242,33 +241,28 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
return;
}
-static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes,
- loff_t *ppos)
+static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
- u64 val;
- char str[64];
- int idx, name, len;
+ int idx, name;
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
idx = MEMFILE_IDX(cft->private);
name = MEMFILE_ATTR(cft->private);
- val = res_counter_read_u64(&h_cg->hugepage[idx], name);
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return res_counter_read_u64(&h_cg->hugepage[idx], name);
}
-static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
- struct cftype *cft, const char *buffer)
+static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
int idx, name, ret;
unsigned long long val;
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
- idx = MEMFILE_IDX(cft->private);
- name = MEMFILE_ATTR(cft->private);
+ buf = strstrip(buf);
+ idx = MEMFILE_IDX(of_cft(of)->private);
+ name = MEMFILE_ATTR(of_cft(of)->private);
switch (name) {
case RES_LIMIT:
@@ -278,7 +272,7 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
break;
}
/* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buffer, &val);
+ ret = res_counter_memparse_write_strategy(buf, &val);
if (ret)
break;
ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
@@ -287,17 +281,17 @@ static int hugetlb_cgroup_write(struct cgroup_subsys_state *css,
ret = -EINVAL;
break;
}
- return ret;
+ return ret ?: nbytes;
}
-static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css,
- unsigned int event)
+static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
int idx, name, ret = 0;
- struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
- idx = MEMFILE_IDX(event);
- name = MEMFILE_ATTR(event);
+ idx = MEMFILE_IDX(of_cft(of)->private);
+ name = MEMFILE_ATTR(of_cft(of)->private);
switch (name) {
case RES_MAX_USAGE:
@@ -310,7 +304,7 @@ static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css,
ret = -EINVAL;
break;
}
- return ret;
+ return ret ?: nbytes;
}
static char *mem_fmt(char *buf, int size, unsigned long hsize)
@@ -337,34 +331,34 @@ static void __init __hugetlb_cgroup_file_init(int idx)
cft = &h->cgroup_files[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
- cft->read = hugetlb_cgroup_read;
- cft->write_string = hugetlb_cgroup_write;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
+ cft->write = hugetlb_cgroup_write;
/* Add the usage file */
cft = &h->cgroup_files[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
- cft->read = hugetlb_cgroup_read;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the MAX usage file */
cft = &h->cgroup_files[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
- cft->trigger = hugetlb_cgroup_reset;
- cft->read = hugetlb_cgroup_read;
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the failcntfile */
cft = &h->cgroup_files[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
- cft->trigger = hugetlb_cgroup_reset;
- cft->read = hugetlb_cgroup_read;
+ cft->write = hugetlb_cgroup_reset;
+ cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
cft = &h->cgroup_files[4];
memset(cft, 0, sizeof(*cft));
- WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
return;
}
@@ -396,7 +390,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON(!PageHuge(oldhpage));
+ VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage);
spin_lock(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(oldhpage);
set_hugetlb_cgroup(oldhpage, NULL);
@@ -408,10 +402,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
return;
}
-struct cgroup_subsys hugetlb_subsys = {
- .name = "hugetlb",
+struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
- .subsys_id = hugetlb_subsys_id,
};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 4c84678371e..95487c71cad 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -55,7 +55,7 @@ static int hwpoison_inject(void *data, u64 val)
return 0;
inject:
- printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+ pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
}
diff --git a/mm/internal.h b/mm/internal.h
index 684f7aa9692..7f22a11fcc6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,14 +22,28 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ return __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
*/
static inline void set_page_refcounted(struct page *page)
{
- VM_BUG_ON(PageTail(page));
- VM_BUG_ON(atomic_read(&page->_count));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
set_page_count(page, 1);
}
@@ -46,12 +61,10 @@ static inline void __get_page_tail_foll(struct page *page,
* speculative page access (like in
* page_cache_get_speculative()) on tail pages.
*/
- VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- VM_BUG_ON(page_mapcount(page) < 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
if (get_page_head)
atomic_inc(&page->first_page->_count);
- atomic_inc(&page->_mapcount);
+ get_huge_page_tail(page);
}
/*
@@ -73,7 +86,7 @@ static inline void get_page_foll(struct page *page)
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
- VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}
}
@@ -85,7 +98,6 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern bool zone_reclaimable(struct zone *zone);
/*
@@ -101,6 +113,7 @@ extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
+extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -121,7 +134,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- bool sync; /* Synchronous migration */
+ enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool finished_update_free; /* True when the zone cached pfns are
* no longer being updated
@@ -131,7 +144,10 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool contended; /* True if a lock was contended */
+ bool contended; /* True if a lock was contended, or
+ * need_resched() true during async
+ * compaction
+ */
};
unsigned long
@@ -144,9 +160,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif
/*
- * function for dealing with page's order in buddy system.
- * zone->lock is already acquired when we use these.
- * So, we don't need atomic page->flags operations here.
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel.
*/
static inline unsigned long page_order(struct page *page)
{
@@ -154,6 +172,11 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+static inline bool is_cow_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -169,26 +192,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
- * Called only in fault path, to determine if a new page is being
- * mapped into a LOCKED vma. If it is, mark page as mlocked.
- */
-static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
- struct page *page)
-{
- VM_BUG_ON(PageLRU(page));
-
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
- return 0;
-
- if (!TestSetPageMlocked(page)) {
- mod_zone_page_state(page_zone(page), NR_MLOCK,
- hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
- }
- return 1;
-}
-
-/*
* must be called with vma's mmap_sem held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
@@ -230,10 +233,6 @@ extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
#endif
#else /* !CONFIG_MMU */
-static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
-{
- return 0;
-}
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void mlock_migrate_page(struct page *new, struct page *old) { }
@@ -370,5 +369,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
new file mode 100644
index 00000000000..7b5dbd1517b
--- /dev/null
+++ b/mm/iov_iter.c
@@ -0,0 +1,743 @@
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+ void *kaddr, *from;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ if (!fault_in_pages_writeable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+
+ /* first chunk, usually the only one */
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ if (likely(!bytes)) {
+ kunmap_atomic(kaddr);
+ goto done;
+ }
+ offset = from - kaddr;
+ buf += copy;
+ kunmap_atomic(kaddr);
+ copy = min(bytes, iov->iov_len - skip);
+ }
+ /* Too bad - revert to non-atomic kmap */
+ kaddr = kmap(page);
+ from = kaddr + offset;
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ kunmap(page);
+done:
+ if (skip == iov->iov_len) {
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+ void *kaddr, *to;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ if (!fault_in_pages_readable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ to = kaddr + offset;
+
+ /* first chunk, usually the only one */
+ left = __copy_from_user_inatomic(to, buf, copy);
+ copy -= left;
+ skip += copy;
+ to += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_from_user_inatomic(to, buf, copy);
+ copy -= left;
+ skip = copy;
+ to += copy;
+ bytes -= copy;
+ }
+ if (likely(!bytes)) {
+ kunmap_atomic(kaddr);
+ goto done;
+ }
+ offset = to - kaddr;
+ buf += copy;
+ kunmap_atomic(kaddr);
+ copy = min(bytes, iov->iov_len - skip);
+ }
+ /* Too bad - revert to non-atomic kmap */
+ kaddr = kmap(page);
+ to = kaddr + offset;
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip += copy;
+ to += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip = copy;
+ to += copy;
+ bytes -= copy;
+ }
+ kunmap(page);
+done:
+ if (skip == iov->iov_len) {
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t copy_from_user_atomic_iovec(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+
+static void advance_iovec(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overruning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->iov = iov;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ if (!(i->type & ITER_BVEC)) {
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+static unsigned long alignment_iovec(const struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ unsigned long res;
+ size_t size = i->count;
+ size_t n;
+
+ if (!size)
+ return 0;
+
+ res = (unsigned long)iov->iov_base + i->iov_offset;
+ n = iov->iov_len - i->iov_offset;
+ if (n >= size)
+ return res | size;
+ size -= n;
+ res |= n;
+ while (size > (++iov)->iov_len) {
+ res |= (unsigned long)iov->iov_base | iov->iov_len;
+ size -= iov->iov_len;
+ }
+ res |= (unsigned long)iov->iov_base | size;
+ return res;
+}
+
+void iov_iter_init(struct iov_iter *i, int direction,
+ const struct iovec *iov, unsigned long nr_segs,
+ size_t count)
+{
+ /* It will get better. Eventually... */
+ if (segment_eq(get_fs(), KERNEL_DS))
+ direction |= ITER_KVEC;
+ i->type = direction;
+ i->iov = iov;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_init);
+
+static ssize_t get_pages_iovec(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ size_t offset = i->iov_offset;
+ const struct iovec *iov = i->iov;
+ size_t len;
+ unsigned long addr;
+ int n;
+ int res;
+
+ len = iov->iov_len - offset;
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ addr = (unsigned long)iov->iov_base + offset;
+ len += *start = addr & (PAGE_SIZE - 1);
+ addr &= ~(PAGE_SIZE - 1);
+ n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
+ if (unlikely(res < 0))
+ return res;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+}
+
+static ssize_t get_pages_alloc_iovec(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ size_t offset = i->iov_offset;
+ const struct iovec *iov = i->iov;
+ size_t len;
+ unsigned long addr;
+ void *p;
+ int n;
+ int res;
+
+ len = iov->iov_len - offset;
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ addr = (unsigned long)iov->iov_base + offset;
+ len += *start = addr & (PAGE_SIZE - 1);
+ addr &= ~(PAGE_SIZE - 1);
+ n = (len + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
+ if (!p)
+ p = vmalloc(n * sizeof(struct page *));
+ if (!p)
+ return -ENOMEM;
+
+ res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
+ if (unlikely(res < 0)) {
+ kvfree(p);
+ return res;
+ }
+ *pages = p;
+ return (res == n ? len : res * PAGE_SIZE) - *start;
+}
+
+static int iov_iter_npages_iovec(const struct iov_iter *i, int maxpages)
+{
+ size_t offset = i->iov_offset;
+ size_t size = i->count;
+ const struct iovec *iov = i->iov;
+ int npages = 0;
+ int n;
+
+ for (n = 0; size && n < i->nr_segs; n++, iov++) {
+ unsigned long addr = (unsigned long)iov->iov_base + offset;
+ size_t len = iov->iov_len - offset;
+ offset = 0;
+ if (unlikely(!len)) /* empty segment */
+ continue;
+ if (len > size)
+ len = size;
+ npages += (addr + len + PAGE_SIZE - 1) / PAGE_SIZE
+ - addr / PAGE_SIZE;
+ if (npages >= maxpages) /* don't bother going further */
+ return maxpages;
+ size -= len;
+ offset = 0;
+ }
+ return min(npages, maxpages);
+}
+
+static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
+{
+ char *from = kmap_atomic(page);
+ memcpy(to, from + offset, len);
+ kunmap_atomic(from);
+}
+
+static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
+{
+ char *to = kmap_atomic(page);
+ memcpy(to + offset, from, len);
+ kunmap_atomic(to);
+}
+
+static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, wanted;
+ const struct bio_vec *bvec;
+ void *kaddr, *from;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ bvec = i->bvec;
+ skip = i->iov_offset;
+ copy = min_t(size_t, bytes, bvec->bv_len - skip);
+
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+ memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (bytes) {
+ bvec++;
+ copy = min(bytes, (size_t)bvec->bv_len);
+ memcpy_to_page(bvec->bv_page, bvec->bv_offset, from, copy);
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ kunmap_atomic(kaddr);
+ if (skip == bvec->bv_len) {
+ bvec++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= bvec - i->bvec;
+ i->bvec = bvec;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, wanted;
+ const struct bio_vec *bvec;
+ void *kaddr, *to;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ bvec = i->bvec;
+ skip = i->iov_offset;
+
+ kaddr = kmap_atomic(page);
+
+ to = kaddr + offset;
+
+ copy = min(bytes, bvec->bv_len - skip);
+
+ memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
+
+ to += copy;
+ skip += copy;
+ bytes -= copy;
+
+ while (bytes) {
+ bvec++;
+ copy = min(bytes, (size_t)bvec->bv_len);
+ memcpy_from_page(to, bvec->bv_page, bvec->bv_offset, copy);
+ skip = copy;
+ to += copy;
+ bytes -= copy;
+ }
+ kunmap_atomic(kaddr);
+ if (skip == bvec->bv_len) {
+ bvec++;
+ skip = 0;
+ }
+ i->count -= wanted;
+ i->nr_segs -= bvec - i->bvec;
+ i->bvec = bvec;
+ i->iov_offset = skip;
+ return wanted;
+}
+
+static size_t copy_from_user_bvec(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t left;
+ const struct bio_vec *bvec;
+ size_t base = i->iov_offset;
+
+ kaddr = kmap_atomic(page);
+ for (left = bytes, bvec = i->bvec; left; bvec++, base = 0) {
+ size_t copy = min(left, bvec->bv_len - base);
+ if (!bvec->bv_len)
+ continue;
+ memcpy_from_page(kaddr + offset, bvec->bv_page,
+ bvec->bv_offset + base, copy);
+ offset += copy;
+ left -= copy;
+ }
+ kunmap_atomic(kaddr);
+ return bytes;
+}
+
+static void advance_bvec(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct bio_vec *bvec = i->bvec;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overruning the iovec).
+ */
+ while (bytes || unlikely(i->count && !bvec->bv_len)) {
+ int copy;
+
+ copy = min(bytes, bvec->bv_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (bvec->bv_len == base) {
+ bvec++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->bvec = bvec;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+
+static unsigned long alignment_bvec(const struct iov_iter *i)
+{
+ const struct bio_vec *bvec = i->bvec;
+ unsigned long res;
+ size_t size = i->count;
+ size_t n;
+
+ if (!size)
+ return 0;
+
+ res = bvec->bv_offset + i->iov_offset;
+ n = bvec->bv_len - i->iov_offset;
+ if (n >= size)
+ return res | size;
+ size -= n;
+ res |= n;
+ while (size > (++bvec)->bv_len) {
+ res |= bvec->bv_offset | bvec->bv_len;
+ size -= bvec->bv_len;
+ }
+ res |= bvec->bv_offset | size;
+ return res;
+}
+
+static ssize_t get_pages_bvec(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t len = bvec->bv_len - i->iov_offset;
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ *start = bvec->bv_offset + i->iov_offset;
+
+ get_page(*pages = bvec->bv_page);
+
+ return len;
+}
+
+static ssize_t get_pages_alloc_bvec(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ const struct bio_vec *bvec = i->bvec;
+ size_t len = bvec->bv_len - i->iov_offset;
+ if (len > i->count)
+ len = i->count;
+ if (len > maxsize)
+ len = maxsize;
+ *start = bvec->bv_offset + i->iov_offset;
+
+ *pages = kmalloc(sizeof(struct page *), GFP_KERNEL);
+ if (!*pages)
+ return -ENOMEM;
+
+ get_page(**pages = bvec->bv_page);
+
+ return len;
+}
+
+static int iov_iter_npages_bvec(const struct iov_iter *i, int maxpages)
+{
+ size_t offset = i->iov_offset;
+ size_t size = i->count;
+ const struct bio_vec *bvec = i->bvec;
+ int npages = 0;
+ int n;
+
+ for (n = 0; size && n < i->nr_segs; n++, bvec++) {
+ size_t len = bvec->bv_len - offset;
+ offset = 0;
+ if (unlikely(!len)) /* empty segment */
+ continue;
+ if (len > size)
+ len = size;
+ npages++;
+ if (npages >= maxpages) /* don't bother going further */
+ return maxpages;
+ size -= len;
+ offset = 0;
+ }
+ return min(npages, maxpages);
+}
+
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC)
+ return copy_page_to_iter_bvec(page, offset, bytes, i);
+ else
+ return copy_page_to_iter_iovec(page, offset, bytes, i);
+}
+EXPORT_SYMBOL(copy_page_to_iter);
+
+size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC)
+ return copy_page_from_iter_bvec(page, offset, bytes, i);
+ else
+ return copy_page_from_iter_iovec(page, offset, bytes, i);
+}
+EXPORT_SYMBOL(copy_page_from_iter);
+
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ if (i->type & ITER_BVEC)
+ return copy_from_user_bvec(page, i, offset, bytes);
+ else
+ return copy_from_user_atomic_iovec(page, i, offset, bytes);
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+void iov_iter_advance(struct iov_iter *i, size_t size)
+{
+ if (i->type & ITER_BVEC)
+ advance_bvec(i, size);
+ else
+ advance_iovec(i, size);
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+ if (i->nr_segs == 1)
+ return i->count;
+ else if (i->type & ITER_BVEC)
+ return min(i->count, i->iov->iov_len - i->iov_offset);
+ else
+ return min(i->count, i->bvec->bv_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
+
+unsigned long iov_iter_alignment(const struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC)
+ return alignment_bvec(i);
+ else
+ return alignment_iovec(i);
+}
+EXPORT_SYMBOL(iov_iter_alignment);
+
+ssize_t iov_iter_get_pages(struct iov_iter *i,
+ struct page **pages, size_t maxsize,
+ size_t *start)
+{
+ if (i->type & ITER_BVEC)
+ return get_pages_bvec(i, pages, maxsize, start);
+ else
+ return get_pages_iovec(i, pages, maxsize, start);
+}
+EXPORT_SYMBOL(iov_iter_get_pages);
+
+ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ struct page ***pages, size_t maxsize,
+ size_t *start)
+{
+ if (i->type & ITER_BVEC)
+ return get_pages_alloc_bvec(i, pages, maxsize, start);
+ else
+ return get_pages_alloc_iovec(i, pages, maxsize, start);
+}
+EXPORT_SYMBOL(iov_iter_get_pages_alloc);
+
+int iov_iter_npages(const struct iov_iter *i, int maxpages)
+{
+ if (i->type & ITER_BVEC)
+ return iov_iter_npages_bvec(i, maxpages);
+ else
+ return iov_iter_npages_iovec(i, maxpages);
+}
+EXPORT_SYMBOL(iov_iter_npages);
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index ff0d9779cec..dcdcadb6953 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "kmemleak: " fmt
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -50,25 +52,25 @@ static int __init kmemleak_test_init(void)
printk(KERN_INFO "Kmemleak testing\n");
/* make some orphan objects */
- pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
- pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+ pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+ pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+ pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+ pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
#ifndef CONFIG_MODULES
- pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+ pr_info("kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
- pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+ pr_info("kmem_cache_alloc(files_cachep) = %p\n",
kmem_cache_alloc(files_cachep, GFP_KERNEL));
#endif
- pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
- pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
- pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
- pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
- pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
+ pr_info("vmalloc(64) = %p\n", vmalloc(64));
/*
* Add elements to a list. They should only appear as orphan
@@ -76,7 +78,7 @@ static int __init kmemleak_test_init(void)
*/
for (i = 0; i < 10; i++) {
elem = kzalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
+ pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
if (!elem)
return -ENOMEM;
INIT_LIST_HEAD(&elem->list);
@@ -85,7 +87,7 @@ static int __init kmemleak_test_init(void)
for_each_possible_cpu(i) {
per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
- pr_info("kmemleak: kmalloc(129) = %p\n",
+ pr_info("kmalloc(129) = %p\n",
per_cpu(kmemleak_test_pointer, i));
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 31f01c5011e..3cda50c1e39 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -192,15 +192,15 @@ static struct kmem_cache *object_cache;
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
+static int kmemleak_enabled;
/* set in the late_initcall if there were no errors */
-static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
+static int kmemleak_initialized;
/* enables or disables early logging of the memory operations */
-static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
+static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
-static atomic_t kmemleak_warning = ATOMIC_INIT(0);
+static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
-static atomic_t kmemleak_error = ATOMIC_INIT(0);
+static int kmemleak_error;
/* minimum and maximum address that may be valid pointers */
static unsigned long min_addr = ULONG_MAX;
@@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1;
static DEFINE_MUTEX(scan_mutex);
/* setting kmemleak=on, will set this var, skipping the disable */
static int kmemleak_skip_disable;
-
+/* If there are leaks that can be reported */
+static bool kmemleak_found_leaks;
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -267,7 +268,7 @@ static void kmemleak_disable(void);
#define kmemleak_warn(x...) do { \
pr_warning(x); \
dump_stack(); \
- atomic_set(&kmemleak_warning, 1); \
+ kmemleak_warning = 1; \
} while (0)
/*
@@ -386,7 +387,7 @@ static void dump_object_info(struct kmemleak_object *object)
pr_notice(" min_count = %d\n", object->min_count);
pr_notice(" count = %d\n", object->count);
pr_notice(" flags = 0x%lx\n", object->flags);
- pr_notice(" checksum = %d\n", object->checksum);
+ pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
print_stack_trace(&trace, 4);
}
@@ -805,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
unsigned long flags;
struct early_log *log;
- if (atomic_read(&kmemleak_error)) {
+ if (kmemleak_error) {
/* kmemleak stopped recording, just count the requests */
crt_early_log++;
return;
@@ -840,7 +841,7 @@ static void early_alloc(struct early_log *log)
unsigned long flags;
int i;
- if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr))
+ if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
return;
/*
@@ -893,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
{
pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -919,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
* Percpu allocations are only scanned and not reported as leaks
* (min_count is set to 0).
*/
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, GFP_KERNEL);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -939,9 +940,9 @@ void __ref kmemleak_free(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -959,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -979,16 +980,50 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
/**
+ * kmemleak_update_trace - update object allocation stack trace
+ * @ptr: pointer to beginning of the object
+ *
+ * Override the object allocation stack trace for cases where the actual
+ * allocation place is not always useful.
+ */
+void __ref kmemleak_update_trace(const void *ptr)
+{
+ struct kmemleak_object *object;
+ unsigned long flags;
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+ if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
+ return;
+
+ object = find_and_get_object((unsigned long)ptr, 1);
+ if (!object) {
+#ifdef DEBUG
+ kmemleak_warn("Updating stack trace for unknown object at %p\n",
+ ptr);
+#endif
+ return;
+ }
+
+ spin_lock_irqsave(&object->lock, flags);
+ object->trace_len = __save_stack_trace(object->trace);
+ spin_unlock_irqrestore(&object->lock, flags);
+
+ put_object(object);
+}
+EXPORT_SYMBOL(kmemleak_update_trace);
+
+/**
* kmemleak_not_leak - mark an allocated object as false positive
* @ptr: pointer to beginning of the object
*
@@ -999,9 +1034,9 @@ void __ref kmemleak_not_leak(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -1019,9 +1054,9 @@ void __ref kmemleak_ignore(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1041,9 +1076,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -1061,9 +1096,9 @@ void __ref kmemleak_no_scan(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (atomic_read(&kmemleak_early_log))
+ else if (kmemleak_early_log)
log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
@@ -1088,7 +1123,7 @@ static bool update_checksum(struct kmemleak_object *object)
*/
static int scan_should_stop(void)
{
- if (!atomic_read(&kmemleak_enabled))
+ if (!kmemleak_enabled)
return 1;
/*
@@ -1299,7 +1334,7 @@ static void kmemleak_scan(void)
/*
* Struct page scanning for each node.
*/
- lock_memory_hotplug();
+ get_online_mems();
for_each_online_node(i) {
unsigned long start_pfn = node_start_pfn(i);
unsigned long end_pfn = node_end_pfn(i);
@@ -1317,7 +1352,7 @@ static void kmemleak_scan(void)
scan_block(page, page + 1, NULL, 1);
}
}
- unlock_memory_hotplug();
+ put_online_mems();
/*
* Scanning the task stacks (may introduce false negatives).
@@ -1382,9 +1417,12 @@ static void kmemleak_scan(void)
}
rcu_read_unlock();
- if (new_leaks)
+ if (new_leaks) {
+ kmemleak_found_leaks = true;
+
pr_info("%d new suspected memory leaks (see "
"/sys/kernel/debug/kmemleak)\n", new_leaks);
+ }
}
@@ -1545,11 +1583,6 @@ static int kmemleak_open(struct inode *inode, struct file *file)
return seq_open(file, &kmemleak_seq_ops);
}
-static int kmemleak_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
static int dump_str_object_info(const char *str)
{
unsigned long flags;
@@ -1592,8 +1625,12 @@ static void kmemleak_clear(void)
spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
+
+ kmemleak_found_leaks = false;
}
+static void __kmemleak_do_cleanup(void);
+
/*
* File write operation to configure kmemleak at run-time. The following
* commands can be written to the /sys/kernel/debug/kmemleak file:
@@ -1606,7 +1643,8 @@ static void kmemleak_clear(void)
* disable it)
* scan - trigger a memory scan
* clear - mark all current reported unreferenced kmemleak objects as
- * grey to ignore printing them
+ * grey to ignore printing them, or free all kmemleak objects
+ * if kmemleak has been disabled.
* dump=... - dump information about the object found at the given address
*/
static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
@@ -1616,9 +1654,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
int buf_size;
int ret;
- if (!atomic_read(&kmemleak_enabled))
- return -EBUSY;
-
buf_size = min(size, (sizeof(buf) - 1));
if (strncpy_from_user(buf, user_buf, buf_size) < 0)
return -EFAULT;
@@ -1628,6 +1663,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
if (ret < 0)
return ret;
+ if (strncmp(buf, "clear", 5) == 0) {
+ if (kmemleak_enabled)
+ kmemleak_clear();
+ else
+ __kmemleak_do_cleanup();
+ goto out;
+ }
+
+ if (!kmemleak_enabled) {
+ ret = -EBUSY;
+ goto out;
+ }
+
if (strncmp(buf, "off", 3) == 0)
kmemleak_disable();
else if (strncmp(buf, "stack=on", 8) == 0)
@@ -1651,8 +1699,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
}
} else if (strncmp(buf, "scan", 4) == 0)
kmemleak_scan();
- else if (strncmp(buf, "clear", 5) == 0)
- kmemleak_clear();
else if (strncmp(buf, "dump=", 5) == 0)
ret = dump_str_object_info(buf + 5);
else
@@ -1674,9 +1720,19 @@ static const struct file_operations kmemleak_fops = {
.read = seq_read,
.write = kmemleak_write,
.llseek = seq_lseek,
- .release = kmemleak_release,
+ .release = seq_release,
};
+static void __kmemleak_do_cleanup(void)
+{
+ struct kmemleak_object *object;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(object, &object_list, object_list)
+ delete_object_full(object->pointer);
+ rcu_read_unlock();
+}
+
/*
* Stop the memory scanning thread and free the kmemleak internal objects if
* no previous scan thread (otherwise, kmemleak may still have some useful
@@ -1684,18 +1740,14 @@ static const struct file_operations kmemleak_fops = {
*/
static void kmemleak_do_cleanup(struct work_struct *work)
{
- struct kmemleak_object *object;
- bool cleanup = scan_thread == NULL;
-
mutex_lock(&scan_mutex);
stop_scan_thread();
- if (cleanup) {
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
- }
+ if (!kmemleak_found_leaks)
+ __kmemleak_do_cleanup();
+ else
+ pr_info("Kmemleak disabled without freeing internal data. "
+ "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
mutex_unlock(&scan_mutex);
}
@@ -1708,14 +1760,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
static void kmemleak_disable(void)
{
/* atomically check whether it was already invoked */
- if (atomic_cmpxchg(&kmemleak_error, 0, 1))
+ if (cmpxchg(&kmemleak_error, 0, 1))
return;
/* stop any memory operation tracing */
- atomic_set(&kmemleak_enabled, 0);
+ kmemleak_enabled = 0;
/* check whether it is too early for a kernel thread */
- if (atomic_read(&kmemleak_initialized))
+ if (kmemleak_initialized)
schedule_work(&cleanup_work);
pr_info("Kernel memory leak detector disabled\n");
@@ -1759,7 +1811,7 @@ void __init kmemleak_init(void)
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
- atomic_set(&kmemleak_early_log, 0);
+ kmemleak_early_log = 0;
kmemleak_disable();
return;
}
@@ -1777,12 +1829,12 @@ void __init kmemleak_init(void)
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
- atomic_set(&kmemleak_early_log, 0);
- if (atomic_read(&kmemleak_error)) {
+ kmemleak_early_log = 0;
+ if (kmemleak_error) {
local_irq_restore(flags);
return;
} else
- atomic_set(&kmemleak_enabled, 1);
+ kmemleak_enabled = 1;
local_irq_restore(flags);
/*
@@ -1826,9 +1878,9 @@ void __init kmemleak_init(void)
log->op_type);
}
- if (atomic_read(&kmemleak_warning)) {
+ if (kmemleak_warning) {
print_log_trace(log);
- atomic_set(&kmemleak_warning, 0);
+ kmemleak_warning = 0;
}
}
}
@@ -1840,9 +1892,9 @@ static int __init kmemleak_late_init(void)
{
struct dentry *dentry;
- atomic_set(&kmemleak_initialized, 1);
+ kmemleak_initialized = 1;
- if (atomic_read(&kmemleak_error)) {
+ if (kmemleak_error) {
/*
* Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff79dc9..346ddc9e4c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item)
static struct page *page_trans_compound_anon(struct page *page)
{
if (PageTransCompound(page)) {
- struct page *head = compound_trans_head(page);
+ struct page *head = compound_head(page);
/*
* head may actually be splitted and freed from under
* us but it's ok here.
@@ -945,7 +945,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
- BUG_ON(pmd_trans_huge(*pmd));
mmun_start = addr;
mmun_end = addr + PAGE_SIZE;
@@ -1891,21 +1890,24 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- unsigned int mapcount = page_mapcount(page);
- int referenced = 0;
+ int ret = SWAP_AGAIN;
int search_new_forks = 0;
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageKsm(page), page);
+
+ /*
+ * Rely on the page lock to protect against concurrent modifications
+ * to that page's node of the stable tree.
+ */
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
stable_node = page_stable_node(page);
if (!stable_node)
- return 0;
+ return ret;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1928,113 +1930,16 @@ again:
if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
continue;
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
-
- referenced += page_referenced_one(page, vma,
- rmap_item->address, &mapcount, vm_flags);
- if (!search_new_forks || !mapcount)
- break;
- }
- anon_vma_unlock_read(anon_vma);
- if (!mapcount)
- goto out;
- }
- if (!search_new_forks++)
- goto again;
-out:
- return referenced;
-}
-
-int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return SWAP_FAIL;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = try_to_unmap_one(page, vma,
- rmap_item->address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page)) {
+ ret = rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg);
+ if (ret != SWAP_AGAIN) {
anon_vma_unlock_read(anon_vma);
goto out;
}
- }
- anon_vma_unlock_read(anon_vma);
- }
- if (!search_new_forks++)
- goto again;
-out:
- return ret;
-}
-
-#ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
-{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
- int search_new_forks = 0;
-
- VM_BUG_ON(!PageKsm(page));
- VM_BUG_ON(!PageLocked(page));
-
- stable_node = page_stable_node(page);
- if (!stable_node)
- return ret;
-again:
- hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
- struct anon_vma *anon_vma = rmap_item->anon_vma;
- struct anon_vma_chain *vmac;
- struct vm_area_struct *vma;
-
- anon_vma_lock_read(anon_vma);
- anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
- 0, ULONG_MAX) {
- vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
- continue;
- /*
- * Initially we examine only the vma which covers this
- * rmap_item; but later, if there is still work to do,
- * we examine covering vmas in other mms: in case they
- * were forked from the original since ksmd passed.
- */
- if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
- continue;
-
- ret = rmap_one(page, vma, rmap_item->address, arg);
- if (ret != SWAP_AGAIN) {
+ if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
goto out;
}
@@ -2047,17 +1952,18 @@ out:
return ret;
}
+#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
struct stable_node *stable_node;
- VM_BUG_ON(!PageLocked(oldpage));
- VM_BUG_ON(!PageLocked(newpage));
- VM_BUG_ON(newpage->mapping != oldpage->mapping);
+ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
stable_node = page_stable_node(newpage);
if (stable_node) {
- VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+ VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
@@ -2438,4 +2344,4 @@ out_free:
out:
return err;
}
-module_init(ksm_init)
+subsys_initcall(ksm_init);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72f9decb010..f1a0db19417 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -87,11 +87,20 @@ restart:
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
+ case LRU_REMOVED_RETRY:
+ assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
+ /*
+ * If the lru lock has been dropped, our list
+ * traversal is now invalid and so we have to
+ * restart from scratch.
+ */
+ if (ret == LRU_REMOVED_RETRY)
+ goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &nlru->list);
@@ -103,6 +112,7 @@ restart:
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
+ assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
@@ -114,7 +124,7 @@ restart:
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru)
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
+ if (key)
+ lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
return 0;
}
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
void list_lru_destroy(struct list_lru *lru)
{
diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb96b32..a402f8fdc68 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
for (; start < end; start += PAGE_SIZE) {
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- page = find_get_page(mapping, index);
+ page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
page_cache_release(page);
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477bb555..6d2f219a48b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,9 +21,15 @@
#include <linux/memblock.h>
#include <asm-generic/sections.h>
+#include <linux/io.h>
+
+#include "internal.h"
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+#endif
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
@@ -34,11 +40,20 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ .physmem.regions = memblock_physmem_init_regions,
+ .physmem.cnt = 1, /* empty dummy entry */
+ .physmem.max = INIT_PHYSMEM_REGIONS,
+#endif
+
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
int memblock_debug __initdata_memblock;
+#ifdef CONFIG_MOVABLE_NODE
+bool movable_node_enabled __initdata_memblock = false;
+#endif
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
@@ -91,7 +106,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -123,7 +138,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -154,11 +169,11 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
/**
* memblock_find_in_range_node - find free area in given range and node
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @start: start of candidate range
+ * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -173,9 +188,9 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* RETURNS:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
- phys_addr_t end, phys_addr_t size,
- phys_addr_t align, int nid)
+phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
int ret;
phys_addr_t kernel_end;
@@ -238,8 +253,8 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(start, end, size, align,
- MAX_NUMNODES);
+ return memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -255,10 +270,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
+ type->regions[0].flags = 0;
memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}
}
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
phys_addr_t *addr)
{
@@ -271,6 +289,20 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
memblock.reserved.max);
}
+phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.memory.regions == memblock_memory_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.memory.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.memory.max);
+}
+
+#endif
+
/**
* memblock_double_array - double the size of the memblock regions array
* @type: memblock type of the regions array being doubled
@@ -405,7 +437,8 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
if (this->base + this->size != next->base ||
memblock_get_region_node(this) !=
- memblock_get_region_node(next)) {
+ memblock_get_region_node(next) ||
+ this->flags != next->flags) {
BUG_ON(this->base + this->size > next->base);
i++;
continue;
@@ -425,13 +458,15 @@ static void __init_memblock memblock_merge_regions(struct memblock_type *type)
* @base: base address of the new region
* @size: size of the new region
* @nid: node id of the new region
+ * @flags: flags of the new region
*
* Insert new memblock region [@base,@base+@size) into @type at @idx.
* @type must already have extra room to accomodate the new region.
*/
static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
- phys_addr_t size, int nid)
+ phys_addr_t size,
+ int nid, unsigned long flags)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -439,17 +474,19 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn));
rgn->base = base;
rgn->size = size;
+ rgn->flags = flags;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
}
/**
- * memblock_add_region - add new memblock region
+ * memblock_add_range - add new memblock region
* @type: memblock type to add new region into
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
+ * @flags: flags of the new region
*
* Add new memblock region [@base,@base+@size) into @type. The new region
* is allowed to overlap with existing ones - overlaps don't affect already
@@ -459,8 +496,9 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* RETURNS:
* 0 on success, -errno on failure.
*/
-static int __init_memblock memblock_add_region(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size, int nid)
+int __init_memblock memblock_add_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size,
+ int nid, unsigned long flags)
{
bool insert = false;
phys_addr_t obase = base;
@@ -475,6 +513,7 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
WARN_ON(type->cnt != 1 || type->total_size);
type->regions[0].base = base;
type->regions[0].size = size;
+ type->regions[0].flags = flags;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
@@ -505,7 +544,8 @@ repeat:
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
- rbase - base, nid);
+ rbase - base, nid,
+ flags);
}
/* area below @rend is dealt with, forget about it */
base = min(rend, end);
@@ -515,7 +555,8 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base, nid);
+ memblock_insert_region(type, i, base, end - base,
+ nid, flags);
}
/*
@@ -537,12 +578,13 @@ repeat:
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
{
- return memblock_add_region(&memblock.memory, base, size, nid);
+ return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
+ return memblock_add_range(&memblock.memory, base, size,
+ MAX_NUMNODES, 0);
}
/**
@@ -597,7 +639,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= base - rbase;
type->total_size -= base - rbase;
memblock_insert_region(type, i, rbase, base - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else if (rend > end) {
/*
* @rgn intersects from above. Split and redo the
@@ -607,7 +650,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->size -= end - rbase;
type->total_size -= end - rbase;
memblock_insert_region(type, i--, rbase, end - rbase,
- memblock_get_region_node(rgn));
+ memblock_get_region_node(rgn),
+ rgn->flags);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
@@ -619,8 +663,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
return 0;
}
-static int __init_memblock __memblock_remove(struct memblock_type *type,
- phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_remove_range(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
int start_rgn, end_rgn;
int i, ret;
@@ -636,43 +680,108 @@ static int __init_memblock __memblock_remove(struct memblock_type *type,
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
- return __memblock_remove(&memblock.memory, base, size);
+ return memblock_remove_range(&memblock.memory, base, size);
}
+
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
+ (unsigned long long)base + size - 1,
(void *)_RET_IP_);
- return __memblock_remove(&memblock.reserved, base, size);
+ kmemleak_free_part(__va(base), size);
+ return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+static int __init_memblock memblock_reserve_region(phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ unsigned long flags)
{
struct memblock_type *_rgn = &memblock.reserved;
- memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
+ memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
- (void *)_RET_IP_);
+ (unsigned long long)base + size - 1,
+ flags, (void *)_RET_IP_);
+
+ return memblock_add_range(_rgn, base, size, nid, flags);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+}
+
+/**
+ * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and mark it with flag
+ * MEMBLOCK_HOTPLUG.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_set_region_flags(&type->regions[i], MEMBLOCK_HOTPLUG);
- return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
+ memblock_merge_regions(type);
+ return 0;
+}
+
+/**
+ * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * This function isolates region [@base, @base + @size), and clear flag
+ * MEMBLOCK_HOTPLUG for the isolated regions.
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
+{
+ struct memblock_type *type = &memblock.memory;
+ int i, ret, start_rgn, end_rgn;
+
+ ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn);
+ if (ret)
+ return ret;
+
+ for (i = start_rgn; i < end_rgn; i++)
+ memblock_clear_region_flags(&type->regions[i],
+ MEMBLOCK_HOTPLUG);
+
+ memblock_merge_regions(type);
+ return 0;
}
/**
- * __next_free_mem_range - next function for for_each_free_mem_range()
+ * __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
- * Find the first free area from *@idx which matches @nid, fill the out
+ * Find the first area from *@idx which matches @nid, fill the out
* parameters, and update *@idx for the next iteration. The lower 32bit of
- * *@idx contains index into memory region and the upper 32bit indexes the
- * areas before each reserved region. For example, if reserved regions
+ * *@idx contains index into type_a and the upper 32bit indexes the
+ * areas before each region in type_b. For example, if type_b regions
* look like the following,
*
* 0:[0-16), 1:[32-48), 2:[128-130)
@@ -684,50 +793,77 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_free_mem_range(u64 *idx, int nid,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __init_memblock __next_mem_range(u64 *idx, int nid,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- struct memblock_type *mem = &memblock.memory;
- struct memblock_type *rsv = &memblock.reserved;
- int mi = *idx & 0xffffffff;
- int ri = *idx >> 32;
+ int idx_a = *idx & 0xffffffff;
+ int idx_b = *idx >> 32;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES,
+ "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ for (; idx_a < type_a->cnt; idx_a++) {
+ struct memblock_region *m = &type_a->regions[idx_a];
- for ( ; mi < mem->cnt; mi++) {
- struct memblock_region *m = &mem->regions[mi];
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
+ int m_nid = memblock_get_region_node(m);
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != m_nid)
continue;
- /* scan areas before each reservation for intersection */
- for ( ; ri < rsv->cnt + 1; ri++) {
- struct memblock_region *r = &rsv->regions[ri];
- phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
- phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+ if (!type_b) {
+ if (out_start)
+ *out_start = m_start;
+ if (out_end)
+ *out_end = m_end;
+ if (out_nid)
+ *out_nid = m_nid;
+ idx_a++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
- /* if ri advanced past mi, break out to advance mi */
+ /* scan areas before each reservation */
+ for (; idx_b < type_b->cnt + 1; idx_b++) {
+ struct memblock_region *r;
+ phys_addr_t r_start;
+ phys_addr_t r_end;
+
+ r = &type_b->regions[idx_b];
+ r_start = idx_b ? r[-1].base + r[-1].size : 0;
+ r_end = idx_b < type_b->cnt ?
+ r->base : ULLONG_MAX;
+
+ /*
+ * if idx_b advanced past idx_a,
+ * break out to advance idx_a
+ */
if (r_start >= m_end)
break;
/* if the two regions intersect, we're done */
if (m_start < r_end) {
if (out_start)
- *out_start = max(m_start, r_start);
+ *out_start =
+ max(m_start, r_start);
if (out_end)
*out_end = min(m_end, r_end);
if (out_nid)
- *out_nid = memblock_get_region_node(m);
+ *out_nid = m_nid;
/*
- * The region which ends first is advanced
- * for the next iteration.
+ * The region which ends first is
+ * advanced for the next iteration.
*/
if (m_end <= r_end)
- mi++;
+ idx_a++;
else
- ri++;
- *idx = (u32)mi | (u64)ri << 32;
+ idx_b++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
return;
}
}
@@ -738,45 +874,80 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
}
/**
- * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
+ * __next_mem_range_rev - generic next function for for_each_*_range_rev()
+ *
+ * Finds the next range from type_a which is not marked as unsuitable
+ * in type_b.
+ *
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @type_a: pointer to memblock_type from where the range is taken
+ * @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
*
- * Reverse of __next_free_mem_range().
+ * Reverse of __next_mem_range().
*/
-void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
- struct memblock_type *mem = &memblock.memory;
- struct memblock_type *rsv = &memblock.reserved;
- int mi = *idx & 0xffffffff;
- int ri = *idx >> 32;
+ int idx_a = *idx & 0xffffffff;
+ int idx_b = *idx >> 32;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
if (*idx == (u64)ULLONG_MAX) {
- mi = mem->cnt - 1;
- ri = rsv->cnt;
+ idx_a = type_a->cnt - 1;
+ idx_b = type_b->cnt;
}
- for ( ; mi >= 0; mi--) {
- struct memblock_region *m = &mem->regions[mi];
+ for (; idx_a >= 0; idx_a--) {
+ struct memblock_region *m = &type_a->regions[idx_a];
+
phys_addr_t m_start = m->base;
phys_addr_t m_end = m->base + m->size;
+ int m_nid = memblock_get_region_node(m);
/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != m_nid)
continue;
- /* scan areas before each reservation for intersection */
- for ( ; ri >= 0; ri--) {
- struct memblock_region *r = &rsv->regions[ri];
- phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0;
- phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX;
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ continue;
+
+ if (!type_b) {
+ if (out_start)
+ *out_start = m_start;
+ if (out_end)
+ *out_end = m_end;
+ if (out_nid)
+ *out_nid = m_nid;
+ idx_a++;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
+ return;
+ }
+
+ /* scan areas before each reservation */
+ for (; idx_b >= 0; idx_b--) {
+ struct memblock_region *r;
+ phys_addr_t r_start;
+ phys_addr_t r_end;
+
+ r = &type_b->regions[idx_b];
+ r_start = idx_b ? r[-1].base + r[-1].size : 0;
+ r_end = idx_b < type_b->cnt ?
+ r->base : ULLONG_MAX;
+ /*
+ * if idx_b advanced past idx_a,
+ * break out to advance idx_a
+ */
- /* if ri advanced past mi, break out to advance mi */
if (r_end <= m_start)
break;
/* if the two regions intersect, we're done */
@@ -786,18 +957,17 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
if (out_end)
*out_end = min(m_end, r_end);
if (out_nid)
- *out_nid = memblock_get_region_node(m);
-
+ *out_nid = m_nid;
if (m_start >= r_start)
- mi--;
+ idx_a--;
else
- ri--;
- *idx = (u32)mi | (u64)ri << 32;
+ idx_b--;
+ *idx = (u32)idx_a | (u64)idx_b << 32;
return;
}
}
}
-
+ /* signal end of iteration */
*idx = ULLONG_MAX;
}
@@ -837,18 +1007,18 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
* memblock_set_node - set node ID on memblock regions
* @base: base of area to set node ID for
* @size: size of area to set node ID for
+ * @type: memblock type to set node ID for
* @nid: node ID to set
*
- * Set the nid of memblock memory regions in [@base,@base+@size) to @nid.
+ * Set the nid of memblock @type regions in [@base,@base+@size) to @nid.
* Regions which cross the area boundaries are split as necessary.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
- int nid)
+ struct memblock_type *type, int nid)
{
- struct memblock_type *type = &memblock.memory;
int start_rgn, end_rgn;
int i, ret;
@@ -864,25 +1034,40 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t max_addr,
- int nid)
+static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
phys_addr_t found;
- if (WARN_ON(!align))
- align = __alignof__(long long);
-
- /* align @size to avoid excessive fragmentation on reserved array */
- size = round_up(size, align);
+ if (!align)
+ align = SMP_CACHE_BYTES;
- found = memblock_find_in_range_node(0, max_addr, size, align, nid);
- if (found && !memblock_reserve(found, size))
+ found = memblock_find_in_range_node(size, align, start, end, nid);
+ if (found && !memblock_reserve(found, size)) {
+ /*
+ * The min_count is set to 0 so that memblock allocations are
+ * never reported as leaks.
+ */
+ kmemleak_alloc(__va(found), size, 0, 0);
return found;
-
+ }
return 0;
}
+phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
+ phys_addr_t start, phys_addr_t end)
+{
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+}
+
+static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t max_addr,
+ int nid)
+{
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+}
+
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
@@ -890,7 +1075,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -920,6 +1105,207 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
+/**
+ * memblock_virt_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Also, allocation may fall back
+ * to any node in the system if the specified node can not
+ * hold the requested memory.
+ *
+ * The allocation is performed from memory region limited by
+ * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
+ *
+ * The memory block is aligned on SMP_CACHE_BYTES if @align == 0.
+ *
+ * The phys address of allocated boot memory block is converted to virtual and
+ * allocated memory is reset to 0.
+ *
+ * In addition, function sets the min_count to 0 using kmemleak_alloc for
+ * allocated boot memory block, so that it is never reported as leaks.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_virt_alloc_internal(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t alloc;
+ void *ptr;
+
+ if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
+ nid = NUMA_NO_NODE;
+
+ /*
+ * Detect any accidental use of these APIs after slab is ready, as at
+ * this moment memblock may be deinitialized already and its
+ * internal data may be destroyed (after execution of free_all_bootmem)
+ */
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+
+ if (!align)
+ align = SMP_CACHE_BYTES;
+
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
+
+again:
+ alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
+ nid);
+ if (alloc)
+ goto done;
+
+ if (nid != NUMA_NO_NODE) {
+ alloc = memblock_find_in_range_node(size, align, min_addr,
+ max_addr, NUMA_NO_NODE);
+ if (alloc)
+ goto done;
+ }
+
+ if (min_addr) {
+ min_addr = 0;
+ goto again;
+ } else {
+ goto error;
+ }
+
+done:
+ memblock_reserve(alloc, size);
+ ptr = phys_to_virt(alloc);
+ memset(ptr, 0, size);
+
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks. This is because many of these blocks
+ * are only referred via the physical address which is not
+ * looked up by kmemleak.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+
+ return ptr;
+
+error:
+ return NULL;
+}
+
+/**
+ * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
+ * additional debug information (including caller info), if enabled.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ return memblock_virt_alloc_internal(size, align, min_addr,
+ max_addr, nid);
+}
+
+/**
+ * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
+ * which provides debug information (including caller info), if enabled,
+ * and panics if the request can not be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+ ptr = memblock_virt_alloc_internal(size, align,
+ min_addr, max_addr, nid);
+ if (ptr)
+ return ptr;
+
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr);
+ return NULL;
+}
+
+/**
+ * __memblock_free_early - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
+{
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ memblock_remove_range(&memblock.reserved, base, size);
+}
+
+/*
+ * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * @addr: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are released directly
+ * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+ u64 cursor, end;
+
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ cursor = PFN_UP(base);
+ end = PFN_DOWN(base + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
/*
* Remaining API functions
@@ -944,7 +1330,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
pages += end_pfn - start_pfn;
}
- return (phys_addr_t)pages << PAGE_SHIFT;
+ return PFN_PHYS(pages);
}
/* lowest address */
@@ -962,16 +1348,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- unsigned long i;
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ struct memblock_region *r;
if (!limit)
return;
/* find out max address */
- for (i = 0; i < memblock.memory.cnt; i++) {
- struct memblock_region *r = &memblock.memory.regions[i];
-
+ for_each_memblock(memory, r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -980,8 +1364,10 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
}
/* truncate both memory and reserved regions */
- __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX);
- __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX);
+ memblock_remove_range(&memblock.memory, max_addr,
+ (phys_addr_t)ULLONG_MAX);
+ memblock_remove_range(&memblock.reserved, max_addr,
+ (phys_addr_t)ULLONG_MAX);
}
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
@@ -1017,14 +1403,13 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
struct memblock_type *type = &memblock.memory;
- int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
+ int mid = memblock_search(type, PFN_PHYS(pfn));
if (mid == -1)
return -1;
- *start_pfn = type->regions[mid].base >> PAGE_SHIFT;
- *end_pfn = (type->regions[mid].base + type->regions[mid].size)
- >> PAGE_SHIFT;
+ *start_pfn = PFN_DOWN(type->regions[mid].base);
+ *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
return type->regions[mid].nid;
}
@@ -1070,13 +1455,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
- int i;
phys_addr_t start, end, orig_start, orig_end;
- struct memblock_type *mem = &memblock.memory;
+ struct memblock_region *r;
- for (i = 0; i < mem->cnt; i++) {
- orig_start = mem->regions[i].base;
- orig_end = mem->regions[i].base + mem->regions[i].size;
+ for_each_memblock(memory, r) {
+ orig_start = r->base;
+ orig_end = r->base + r->size;
start = round_up(orig_start, align);
end = round_down(orig_end, align);
@@ -1084,11 +1468,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
continue;
if (start < end) {
- mem->regions[i].base = start;
- mem->regions[i].size = end - start;
+ r->base = start;
+ r->size = end - start;
} else {
- memblock_remove_region(mem, i);
- i--;
+ memblock_remove_region(&memblock.memory,
+ r - memblock.memory.regions);
+ r--;
}
}
}
@@ -1098,9 +1483,15 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit)
memblock.current_limit = limit;
}
+phys_addr_t __init_memblock memblock_get_current_limit(void)
+{
+ return memblock.current_limit;
+}
+
static void __init_memblock memblock_dump(struct memblock_type *type, char *name)
{
unsigned long long base, size;
+ unsigned long flags;
int i;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
@@ -1111,13 +1502,14 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
+ flags = rgn->flags;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
- pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n",
- name, i, base, base + size - 1, size, nid_buf);
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
+ name, i, base, base + size - 1, size, nid_buf, flags);
}
}
@@ -1188,6 +1580,9 @@ static int __init memblock_init_debugfs(void)
return -ENXIO;
debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+ debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops);
+#endif
return 0;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6e11b..1f14a430c65 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
+#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
+#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -65,8 +66,8 @@
#include <trace/events/vmscan.h>
-struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-EXPORT_SYMBOL(mem_cgroup_subsys);
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -79,7 +80,7 @@ int do_swap_account __read_mostly;
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
-static int really_do_swap_account __initdata = 0;
+static int really_do_swap_account __initdata;
#endif
#else
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
* matches memcg->dead_count of the hierarchy root group.
*/
struct mem_cgroup *last_visited;
- unsigned long last_dead_count;
+ int last_dead_count;
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
struct eventfd_ctx *eventfd;
};
+/*
+ * cgroup_event represents events which userspace want to receive.
+ */
+struct mem_cgroup_event {
+ /*
+ * memcg which the event belongs to.
+ */
+ struct mem_cgroup *memcg;
+ /*
+ * eventfd to signal userspace about the event.
+ */
+ struct eventfd_ctx *eventfd;
+ /*
+ * Each of these stored in a list by the cgroup.
+ */
+ struct list_head list;
+ /*
+ * register_event() callback will be used to add new userspace
+ * waiter for changes related to this event. Use eventfd_signal()
+ * on eventfd to send notification to userspace.
+ */
+ int (*register_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args);
+ /*
+ * unregister_event() callback will be called when userspace closes
+ * the eventfd or on cgroup removing. This callback must be set,
+ * if you want provide notification functionality.
+ */
+ void (*unregister_event)(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd);
+ /*
+ * All fields below needed to unregister event when
+ * userspace closes eventfd.
+ */
+ poll_table pt;
+ wait_queue_head_t *wqh;
+ wait_queue_t wait;
+ struct work_struct remove;
+};
+
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
@@ -316,10 +357,9 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list. per-memcg */
+ /* analogous to slab_common's slab_caches list, but per-memcg;
+ * protected by memcg_slab_mutex */
struct list_head memcg_slab_caches;
- /* Not a spinlock, we can take a lot of time walking the list */
- struct mutex slab_caches_mutex;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
@@ -331,27 +371,20 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
-static size_t memcg_size(void)
-{
- return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
-}
-
/* internal only representation about the status of kmem accounting. */
enum {
- KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
- KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
- ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
@@ -363,16 +396,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
- set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
- clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
/*
@@ -490,11 +513,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
-{
- return &mem_cgroup_from_css(css)->vmpressure;
-}
-
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -508,18 +526,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
- /*
- * The ID of the root cgroup is 0, but memcg treat 0 as an
- * invalid ID, so we return (cgroup_id + 1).
- */
- return memcg->css.cgroup->id + 1;
+ return memcg->css.id;
}
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
- css = css_from_id(id - 1, &mem_cgroup_subsys);
+ css = css_from_id(id, &memory_cgrp_subsys);
return mem_cgroup_from_css(css);
}
@@ -552,7 +566,8 @@ void sock_update_memcg(struct sock *sk)
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
+ memcg_proto_active(cg_proto) &&
+ css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
@@ -658,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
+mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
+ int nid = zone_to_nid(zone);
+ int zid = zone_idx(zone);
+
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
@@ -670,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
+mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
int nid = page_to_nid(page);
int zid = page_zonenum(page);
- return mem_cgroup_zoneinfo(memcg, nid, zid);
+ return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
static struct mem_cgroup_tree_per_zone *
@@ -693,11 +710,9 @@ soft_limit_tree_from_page(struct page *page)
return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}
-static void
-__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz,
- unsigned long long new_usage_in_excess)
+static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -727,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
mz->on_tree = true;
}
-static void
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
if (!mz->on_tree)
return;
@@ -738,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
mz->on_tree = false;
}
-static void
-mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
+ struct mem_cgroup_tree_per_zone *mctz)
{
spin_lock(&mctz->lock);
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
spin_unlock(&mctz->lock);
}
@@ -754,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
- int nid = page_to_nid(page);
- int zid = page_zonenum(page);
- mctz = soft_limit_tree_from_page(page);
+ mctz = soft_limit_tree_from_page(page);
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ mz = mem_cgroup_page_zoneinfo(memcg, page);
excess = res_counter_soft_limit_excess(&memcg->res);
/*
* We have to update the tree if mz is on RB-tree or
@@ -773,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
- __mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
- __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
@@ -786,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
- int node, zone;
- struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
- for_each_node(node) {
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- mz = mem_cgroup_zoneinfo(memcg, node, zone);
- mctz = soft_limit_tree_node_zone(node, zone);
- mem_cgroup_remove_exceeded(memcg, mz, mctz);
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ mctz = soft_limit_tree_node_zone(nid, zid);
+ mem_cgroup_remove_exceeded(mz, mctz);
}
}
}
@@ -817,9 +826,9 @@ retry:
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
- !css_tryget(&mz->memcg->css))
+ !css_tryget_online(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -902,8 +911,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
bool anon, int nr_pages)
{
- preempt_disable();
-
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
@@ -928,12 +935,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-
- preempt_enable();
}
-unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
@@ -941,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return mz->lru_size[lru];
}
-static unsigned long
-mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
- unsigned int lru_mask)
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid,
+ unsigned int lru_mask)
{
- struct mem_cgroup_per_zone *mz;
- enum lru_list lru;
- unsigned long ret = 0;
-
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
- for_each_lru(lru) {
- if (BIT(lru) & lru_mask)
- ret += mz->lru_size[lru];
- }
- return ret;
-}
-
-static unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- u64 total = 0;
+ unsigned long nr = 0;
int zid;
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- total += mem_cgroup_zone_nr_lru_pages(memcg,
- nid, zid, lru_mask);
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
- return total;
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct mem_cgroup_per_zone *mz;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ nr += mz->lru_size[lru];
+ }
+ }
+ return nr;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
unsigned int lru_mask)
{
+ unsigned long nr = 0;
int nid;
- u64 total = 0;
for_each_node_state(nid, N_MEMORY)
- total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return total;
+ nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
+ return nr;
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -1053,26 +1049,28 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
if (unlikely(!p))
return NULL;
- return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
+ return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
- if (!mm)
- return NULL;
- /*
- * Because we have no locks, mm->owner's may be being moved to other
- * cgroup. We use css_tryget() here even if this looks
- * pessimistic (rather than adding locks here).
- */
rcu_read_lock();
do {
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- break;
- } while (!css_tryget(&memcg->css));
+ /*
+ * Page cache insertions can happen withou an
+ * actual mm context, e.g. during disk probing
+ * on boot, loopback IO, acct() writes etc.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ memcg = root_mem_cgroup;
+ }
+ } while (!css_tryget_online(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1098,16 +1096,23 @@ skip_node:
* skipped and we should continue the tree walk.
* last_visited css is safe to use because it is
* protected by css_get and the tree walk is rcu safe.
+ *
+ * We do not take a reference on the root of the tree walk
+ * because we might race with the root removal when it would
+ * be the only node in the iterated hierarchy and mem_cgroup_iter
+ * would end up in an endless loop because it expects that at
+ * least one valid node will be returned. Root cannot disappear
+ * because caller of the iterator should hold it already so
+ * skipping css reference should be safe.
*/
if (next_css) {
- struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
+ if ((next_css == &root->css) ||
+ ((next_css->flags & CSS_ONLINE) &&
+ css_tryget_online(next_css)))
+ return mem_cgroup_from_css(next_css);
- if (css_tryget(&mem->css))
- return mem;
- else {
- prev_css = next_css;
- goto skip_node;
- }
+ prev_css = next_css;
+ goto skip_node;
}
return NULL;
@@ -1141,7 +1146,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
if (iter->last_dead_count == *sequence) {
smp_rmb();
position = iter->last_visited;
- if (position && !css_tryget(&position->css))
+
+ /*
+ * We cannot take a reference to root because we might race
+ * with root removal and returning NULL would end up in
+ * an endless loop on the iterator user level when root
+ * would be returned all the time.
+ */
+ if (position && position != root &&
+ !css_tryget_online(&position->css))
position = NULL;
}
return position;
@@ -1150,9 +1163,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
struct mem_cgroup *last_visited,
struct mem_cgroup *new_position,
+ struct mem_cgroup *root,
int sequence)
{
- if (last_visited)
+ /* root reference counting symmetric to mem_cgroup_iter_load */
+ if (last_visited && last_visited != root)
css_put(&last_visited->css);
/*
* We store the sequence count from the time @last_visited was
@@ -1210,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
int uninitialized_var(seq);
if (reclaim) {
- int nid = zone_to_nid(reclaim->zone);
- int zid = zone_idx(reclaim->zone);
struct mem_cgroup_per_zone *mz;
- mz = mem_cgroup_zoneinfo(root, nid, zid);
+ mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
iter->last_visited = NULL;
@@ -1227,7 +1240,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- mem_cgroup_iter_update(iter, last_visited, memcg, seq);
+ mem_cgroup_iter_update(iter, last_visited, memcg, root,
+ seq);
if (!memcg)
iter->generation++;
@@ -1320,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
goto out;
}
- mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
+ mz = mem_cgroup_zone_zoneinfo(memcg, zone);
lruvec = &mz->lruvec;
out:
/*
@@ -1379,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
pc->mem_cgroup = memcg = root_mem_cgroup;
- mz = page_cgroup_zoneinfo(memcg, page);
+ mz = mem_cgroup_page_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
@@ -1450,7 +1464,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
p = find_lock_task_mm(task);
if (p) {
- curr = try_get_mem_cgroup_from_mm(p->mm);
+ curr = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1464,8 +1478,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
css_get(&curr->css);
rcu_read_unlock();
}
- if (!curr)
- return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1519,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
/* root ? */
- if (!css_parent(&memcg->css))
+ if (mem_cgroup_disabled() || !memcg->css.parent)
return vm_swappiness;
return memcg->swappiness;
@@ -1563,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
}
/*
- * 2 routines for checking "mem" is under move_account() or not.
- *
- * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
- * is used for avoiding races in accounting. If true,
- * pc->mem_cgroup may be overwritten.
+ * A routine for checking "mem" is under move_account() or not.
*
- * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
- * under hierarchy of moving cgroups. This is for
- * waiting at hith-memory prressure caused by "move".
+ * Checking a cgroup is mc.from or mc.to or under hierarchy of
+ * moving cgroups. This is for waiting at high-memory pressure
+ * caused by "move".
*/
-
-static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
-{
- VM_BUG_ON(!rcu_read_lock_held());
- return atomic_read(&memcg->moving_account) > 0;
-}
-
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
struct mem_cgroup *from;
@@ -1622,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
* Take this lock when
* - a code tries to modify page's memcg while it's USED.
* - a code tries to modify page state accounting in a memcg.
- * see mem_cgroup_stolen(), too.
*/
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
unsigned long *flags)
@@ -1647,53 +1647,25 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct cgroup *task_cgrp;
- struct cgroup *mem_cgrp;
- /*
- * Need a buffer in BSS, can't rely on allocations. The code relies
- * on the assumption that OOM is serialized for memory controller.
- * If this assumption is broken, revisit this code.
- */
- static char memcg_name[PATH_MAX];
- int ret;
+ /* oom_info_lock ensures that parallel ooms do not interleave */
+ static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
if (!p)
return;
+ mutex_lock(&oom_info_lock);
rcu_read_lock();
- mem_cgrp = memcg->css.cgroup;
- task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_info(" killed as a result of limit of ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_info("\n");
- ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- /*
- * Unfortunately, we are unable to convert to a useful name
- * But we'll still print out the usage information
- */
- rcu_read_unlock();
- goto done;
- }
- rcu_read_unlock();
-
- pr_info("Task in %s killed", memcg_name);
-
- rcu_read_lock();
- ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
- if (ret < 0) {
- rcu_read_unlock();
- goto done;
- }
rcu_read_unlock();
- /*
- * Continues from above, so we don't need an KERN_ level
- */
- pr_cont(" as a result of limit of %s\n", memcg_name);
-done:
-
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1708,13 +1680,8 @@ done:
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats");
-
- rcu_read_lock();
- ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
- if (!ret)
- pr_cont(" for %s", memcg_name);
- rcu_read_unlock();
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(iter->css.cgroup);
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -1730,6 +1697,7 @@ done:
pr_cont("\n");
}
+ mutex_unlock(&oom_info_lock);
}
/*
@@ -1822,13 +1790,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
break;
};
points = oom_badness(task, memcg, NULL, totalpages);
- if (points > chosen_points) {
- if (chosen)
- put_task_struct(chosen);
- chosen = task;
- chosen_points = points;
- get_task_struct(chosen);
- }
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points &&
+ thread_group_leader(chosen))
+ continue;
+
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
}
css_task_iter_end(&it);
}
@@ -2284,12 +2257,11 @@ cleanup:
}
/*
- * Currently used to update mapped file statistics, but the routine can be
- * generalized to update other statistics as well.
+ * Used to update mapped file or writeback or other statistics.
*
* Notes: Race condition
*
- * We usually use page_cgroup_lock() for accessing page_cgroup member but
+ * We usually use lock_page_cgroup() for accessing page_cgroup member but
* it tends to be costly. But considering some conditions, we doesn't need
* to do so _always_.
*
@@ -2303,8 +2275,8 @@ cleanup:
* by flags.
*
* Considering "move", this is an only case we see a race. To make the race
- * small, we check mm->moving_account and detect there are possibility of race
- * If there is, we take a lock.
+ * small, we check memcg->moving_account and detect there are possibility
+ * of race or not. If there is, we take a lock.
*/
void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2322,9 +2294,10 @@ again:
* If this memory cgroup is not under account moving, we don't
* need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
- * rcu_read_unlock() if mem_cgroup_stolen() == true.
+ * rcu_read_unlock().
*/
- if (!mem_cgroup_stolen(memcg))
+ VM_BUG_ON(!rcu_read_lock_held());
+ if (atomic_read(&memcg->moving_account) <= 0)
return;
move_lock_mem_cgroup(memcg, flags);
@@ -2432,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
*/
static void drain_local_stock(struct work_struct *dummy)
{
- struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+ struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
@@ -2579,7 +2552,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
}
-/* See __mem_cgroup_try_charge() for details */
+/* See mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
@@ -2652,116 +2625,52 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return CHARGE_NOMEM;
}
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update res_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- * 0 ... on success, filling *ptr with a valid memcg pointer.
- * -ENOMEM ... charge failure because of resource limits.
- * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
*
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
*/
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- struct mem_cgroup **ptr,
- bool oom)
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *memcg = NULL;
int ret;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
/*
- * Unlike gloval-vm's OOM-kill, we're not in memory shortage
- * in system level. So, allow to go ahead dying process in addition to
- * MEMDIE process.
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)
- || fatal_signal_pending(current)))
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current) ||
+ current->flags & PF_EXITING))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
- goto bypass;
+ goto nomem;
- /*
- * We always charge the cgroup the mm_struct belongs to.
- * The mm_struct's mem_cgroup changes on task migration if the
- * thread group leader migrates. It's possible that mm is not
- * set, if so charge the root memcg (happens for pagecache usage).
- */
- if (!*ptr && !mm)
- *ptr = root_mem_cgroup;
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
again:
- if (*ptr) { /* css should be a valid one */
- memcg = *ptr;
- if (mem_cgroup_is_root(memcg))
- goto done;
- if (consume_stock(memcg, nr_pages))
- goto done;
- css_get(&memcg->css);
- } else {
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(mm->owner);
- /*
- * Because we don't have task_lock(), "p" can exit.
- * In that case, "memcg" can point to root or p can be NULL with
- * race with swapoff. Then, we have small risk of mis-accouning.
- * But such kind of mis-account by race always happens because
- * we don't have cgroup_mutex(). It's overkill and we allo that
- * small race, here.
- * (*) swapoff at el will charge against mm-struct not against
- * task-struct. So, mm->owner can be NULL.
- */
- memcg = mem_cgroup_from_task(p);
- if (!memcg)
- memcg = root_mem_cgroup;
- if (mem_cgroup_is_root(memcg)) {
- rcu_read_unlock();
- goto done;
- }
- if (consume_stock(memcg, nr_pages)) {
- /*
- * It seems dagerous to access memcg without css_get().
- * But considering how consume_stok works, it's not
- * necessary. If consume_stock success, some charges
- * from this memcg are cached on this cpu. So, we
- * don't need to call css_get()/css_tryget() before
- * calling consume_stock().
- */
- rcu_read_unlock();
- goto done;
- }
- /* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&memcg->css)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- }
+ if (consume_stock(memcg, nr_pages))
+ goto done;
do {
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
- if (fatal_signal_pending(current)) {
- css_put(&memcg->css);
+ if (fatal_signal_pending(current))
goto bypass;
- }
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
@@ -2770,17 +2679,12 @@ again:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&memcg->css);
- memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom) {
- css_put(&memcg->css);
+ if (!oom || invoke_oom)
goto nomem;
- }
nr_oom_retries--;
break;
}
@@ -2788,20 +2692,44 @@ again:
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- css_put(&memcg->css);
done:
- *ptr = memcg;
return 0;
nomem:
- if (!(gfp_mask & __GFP_NOFAIL)) {
- *ptr = NULL;
+ if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
- }
bypass:
- *ptr = root_mem_cgroup;
return -EINTR;
}
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct or
+ * NULL the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup;
+ else if (ret)
+ memcg = NULL;
+
+ return memcg;
+}
+
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
@@ -2839,9 +2767,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling css_tryget if
- * the mem_cgroup is used for charging. (dropping refcnt from swap can be
- * called against removed memcg.)
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
@@ -2858,20 +2786,20 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
unsigned short id;
swp_entry_t ent;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
- if (memcg && !css_tryget(&memcg->css))
+ if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
} else if (PageSwapCache(page)) {
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
- if (memcg && !css_tryget(&memcg->css))
+ if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
}
@@ -2892,7 +2820,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- VM_BUG_ON(PageCgroupUsed(pc));
+ VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2927,7 +2855,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
if (lrucare) {
if (was_on_lru) {
lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
@@ -2953,10 +2881,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
static DEFINE_MUTEX(set_limit_mutex);
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
+ * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
+ */
+static DEFINE_MUTEX(memcg_slab_mutex);
+
+static DEFINE_MUTEX(activate_kmem_mutex);
+
static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
{
return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+ memcg_kmem_is_active(memcg);
}
/*
@@ -2973,10 +2909,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
}
#ifdef CONFIG_SLABINFO
-static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
if (!memcg_can_account_kmem(memcg))
@@ -2984,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
print_slabinfo_header(m);
- mutex_lock(&memcg->slab_caches_mutex);
+ mutex_lock(&memcg_slab_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list)
cache_show(memcg_params_to_cache(params), m);
- mutex_unlock(&memcg->slab_caches_mutex);
+ mutex_unlock(&memcg_slab_mutex);
return 0;
}
@@ -2996,20 +2931,17 @@ static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
- struct mem_cgroup *_memcg;
int ret = 0;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- _memcg = memcg;
- ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, oom_gfp_allowed(gfp));
-
+ ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
+ oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
- * __mem_cgroup_try_charge() chosed to bypass to root due to
+ * mem_cgroup_try_charge() chosed to bypass to root due to
* OOM kill or fatal signal. Since our only options are to
* either fail the allocation or charge it to this cgroup, do
* it as a temporary condition. But we can't fail. From a
@@ -3019,7 +2951,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
*
* This condition will only trigger if the task entered
* memcg_charge_kmem in a sane state, but was OOM-killed during
- * __mem_cgroup_try_charge() above. Tasks that were already
+ * mem_cgroup_try_charge() above. Tasks that were already
* dying when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
@@ -3056,16 +2988,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
css_put(&memcg->css);
}
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
-{
- if (!memcg)
- return;
-
- mutex_lock(&memcg->slab_caches_mutex);
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- mutex_unlock(&memcg->slab_caches_mutex);
-}
-
/*
* helper for acessing a memcg's index. It will be used as an index in the
* child cache array in kmem_cache, and also to derive its name. This function
@@ -3076,43 +2998,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
-{
- int num, ret;
-
- num = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (num < 0)
- return num;
- /*
- * After this point, kmem_accounted (that we test atomically in
- * the beginning of this conditional), is no longer 0. This
- * guarantees only one process will set the following boolean
- * to true. We don't need test_and_set because we're protected
- * by the set_limit_mutex anyway.
- */
- memcg_kmem_set_activated(memcg);
-
- ret = memcg_update_all_caches(num+1);
- if (ret) {
- ida_simple_remove(&kmem_limited_groups, num);
- memcg_kmem_clear_activated(memcg);
- return ret;
- }
-
- memcg->kmemcg_id = num;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
- mutex_init(&memcg->slab_caches_mutex);
- return 0;
-}
-
static size_t memcg_caches_array_size(int num_groups)
{
ssize_t size;
@@ -3139,8 +3024,6 @@ void memcg_update_array_size(int num)
memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
-static void kmem_cache_destroy_work_func(struct work_struct *w);
-
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
@@ -3149,18 +3032,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
if (num_groups > memcg_limited_groups_array_size) {
int i;
+ struct memcg_cache_params *new_params;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params) {
- s->memcg_params = cur_params;
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
return -ENOMEM;
- }
- s->memcg_params->is_root_cache = true;
+ new_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
@@ -3174,7 +3056,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
- s->memcg_params->memcg_caches[i] =
+ new_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
@@ -3187,13 +3069,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
* bigger than the others. And all updates will reset this
* anyway.
*/
- kfree(cur_params);
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
}
return 0;
}
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
+int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
{
size_t size;
@@ -3213,43 +3097,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
- INIT_WORK(&s->memcg_params->destroy,
- kmem_cache_destroy_work_func);
+ css_get(&memcg->css);
} else
s->memcg_params->is_root_cache = true;
return 0;
}
-void memcg_release_cache(struct kmem_cache *s)
+void memcg_free_cache_params(struct kmem_cache *s)
{
- struct kmem_cache *root;
- struct mem_cgroup *memcg;
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ css_put(&s->memcg_params->memcg->css);
+ kfree(s->memcg_params);
+}
+
+static void memcg_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+ static char memcg_name_buf[NAME_MAX + 1]; /* protected by
+ memcg_slab_mutex */
+ struct kmem_cache *cachep;
int id;
+ lockdep_assert_held(&memcg_slab_mutex);
+
+ id = memcg_cache_id(memcg);
+
/*
- * This happens, for instance, when a root cache goes away before we
- * add any memcg.
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
*/
- if (!s->memcg_params)
+ if (cache_from_memcg_idx(root_cache, id))
return;
- if (s->memcg_params->is_root_cache)
- goto out;
+ cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
+ cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
+ /*
+ * If we could not create a memcg cache, do not complain, because
+ * that's not critical at all as we can always proceed with the root
+ * cache.
+ */
+ if (!cachep)
+ return;
- memcg = s->memcg_params->memcg;
- id = memcg_cache_id(memcg);
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
- root = s->memcg_params->root_cache;
- root->memcg_params->memcg_caches[id] = NULL;
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
- mutex_lock(&memcg->slab_caches_mutex);
- list_del(&s->memcg_params->list);
- mutex_unlock(&memcg->slab_caches_mutex);
+ BUG_ON(root_cache->memcg_params->memcg_caches[id]);
+ root_cache->memcg_params->memcg_caches[id] = cachep;
+}
- css_put(&memcg->css);
-out:
- kfree(s->memcg_params);
+static void memcg_unregister_cache(struct kmem_cache *cachep)
+{
+ struct kmem_cache *root_cache;
+ struct mem_cgroup *memcg;
+ int id;
+
+ lockdep_assert_held(&memcg_slab_mutex);
+
+ BUG_ON(is_root_cache(cachep));
+
+ root_cache = cachep->memcg_params->root_cache;
+ memcg = cachep->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
+
+ BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
+ root_cache->memcg_params->memcg_caches[id] = NULL;
+
+ list_del(&cachep->memcg_params->list);
+
+ kmem_cache_destroy(cachep);
}
/*
@@ -3283,241 +3209,74 @@ static inline void memcg_resume_kmem_account(void)
current->memcg_kmem_skip_account--;
}
-static void kmem_cache_destroy_work_func(struct work_struct *w)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_params *p;
-
- p = container_of(w, struct memcg_cache_params, destroy);
-
- cachep = memcg_params_to_cache(p);
-
- /*
- * If we get down to 0 after shrink, we could delete right away.
- * However, memcg_release_pages() already puts us back in the workqueue
- * in that case. If we proceed deleting, we'll get a dangling
- * reference, and removing the object from the workqueue in that case
- * is unnecessary complication. We are not a fast path.
- *
- * Note that this case is fundamentally different from racing with
- * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
- * kmem_cache_shrink, not only we would be reinserting a dead cache
- * into the queue, but doing so from inside the worker racing to
- * destroy it.
- *
- * So if we aren't down to zero, we'll just schedule a worker and try
- * again
- */
- if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
- kmem_cache_shrink(cachep);
- if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
- return;
- } else
- kmem_cache_destroy(cachep);
-}
-
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
-{
- if (!cachep->memcg_params->dead)
- return;
-
- /*
- * There are many ways in which we can get here.
- *
- * We can get to a memory-pressure situation while the delayed work is
- * still pending to run. The vmscan shrinkers can then release all
- * cache memory and get us to destruction. If this is the case, we'll
- * be executed twice, which is a bug (the second time will execute over
- * bogus data). In this case, cancelling the work should be fine.
- *
- * But we can also get here from the worker itself, if
- * kmem_cache_shrink is enough to shake all the remaining objects and
- * get the page count to 0. In this case, we'll deadlock if we try to
- * cancel the work (the worker runs with an internal lock held, which
- * is the same lock we would hold for cancel_work_sync().)
- *
- * Since we can't possibly know who got us here, just refrain from
- * running if there is already work pending
- */
- if (work_pending(&cachep->memcg_params->destroy))
- return;
- /*
- * We have to defer the actual destroying to a workqueue, because
- * we might currently be in a context that cannot sleep.
- */
- schedule_work(&cachep->memcg_params->destroy);
-}
-
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
- struct kmem_cache *s)
-{
- struct kmem_cache *new;
- static char *tmp_name = NULL;
-
- lockdep_assert_held(&memcg_cache_mutex);
-
- /*
- * kmem_cache_create_memcg duplicates the given name and
- * cgroup_name for this name requires RCU context.
- * This static temporary buffer is used to prevent from
- * pointless shortliving allocation.
- */
- if (!tmp_name) {
- tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!tmp_name)
- return NULL;
- }
-
- rcu_read_lock();
- snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
- memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
- rcu_read_unlock();
-
- new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor, s);
-
- if (new)
- new->allocflags |= __GFP_KMEMCG;
-
- return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct kmem_cache *new_cachep;
- int idx;
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- idx = memcg_cache_id(memcg);
-
- mutex_lock(&memcg_cache_mutex);
- new_cachep = cache_from_memcg_idx(cachep, idx);
- if (new_cachep) {
- css_put(&memcg->css);
- goto out;
- }
-
- new_cachep = kmem_cache_dup(memcg, cachep);
- if (new_cachep == NULL) {
- new_cachep = cachep;
- css_put(&memcg->css);
- goto out;
- }
-
- atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
- cachep->memcg_params->memcg_caches[idx] = new_cachep;
- /*
- * the readers won't lock, make sure everybody sees the updated value,
- * so they won't put stuff in the queue again for no reason
- */
- wmb();
-out:
- mutex_unlock(&memcg_cache_mutex);
- return new_cachep;
-}
-
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+int __memcg_cleanup_cache_params(struct kmem_cache *s)
{
struct kmem_cache *c;
- int i;
-
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- return;
+ int i, failed = 0;
- /*
- * If the cache is being destroyed, we trust that there is no one else
- * requesting objects from it. Even if there are, the sanity checks in
- * kmem_cache_destroy should caught this ill-case.
- *
- * Still, we don't want anyone else freeing memcg_caches under our
- * noses, which can happen if a new memcg comes to life. As usual,
- * we'll take the set_limit_mutex to protect ourselves against this.
- */
- mutex_lock(&set_limit_mutex);
+ mutex_lock(&memcg_slab_mutex);
for_each_memcg_cache_index(i) {
c = cache_from_memcg_idx(s, i);
if (!c)
continue;
- /*
- * We will now manually delete the caches, so to avoid races
- * we need to cancel all pending destruction workers and
- * proceed with destruction ourselves.
- *
- * kmem_cache_destroy() will call kmem_cache_shrink internally,
- * and that could spawn the workers again: it is likely that
- * the cache still have active pages until this very moment.
- * This would lead us back to mem_cgroup_destroy_cache.
- *
- * But that will not execute at all if the "dead" flag is not
- * set, so flip it down to guarantee we are in control.
- */
- c->memcg_params->dead = false;
- cancel_work_sync(&c->memcg_params->destroy);
- kmem_cache_destroy(c);
+ memcg_unregister_cache(c);
+
+ if (cache_from_memcg_idx(s, i))
+ failed++;
}
- mutex_unlock(&set_limit_mutex);
+ mutex_unlock(&memcg_slab_mutex);
+ return failed;
}
-struct create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
- struct memcg_cache_params *params;
+ struct memcg_cache_params *params, *tmp;
if (!memcg_kmem_is_active(memcg))
return;
- mutex_lock(&memcg->slab_caches_mutex);
- list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ mutex_lock(&memcg_slab_mutex);
+ list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
- cachep->memcg_params->dead = true;
- schedule_work(&cachep->memcg_params->destroy);
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ memcg_unregister_cache(cachep);
}
- mutex_unlock(&memcg->slab_caches_mutex);
+ mutex_unlock(&memcg_slab_mutex);
}
-static void memcg_create_cache_work_func(struct work_struct *w)
+struct memcg_register_cache_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
+static void memcg_register_cache_func(struct work_struct *w)
{
- struct create_work *cw;
+ struct memcg_register_cache_work *cw =
+ container_of(w, struct memcg_register_cache_work, work);
+ struct mem_cgroup *memcg = cw->memcg;
+ struct kmem_cache *cachep = cw->cachep;
+
+ mutex_lock(&memcg_slab_mutex);
+ memcg_register_cache(memcg, cachep);
+ mutex_unlock(&memcg_slab_mutex);
- cw = container_of(w, struct create_work, work);
- memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ css_put(&memcg->css);
kfree(cw);
}
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct create_work *cw;
+ struct memcg_register_cache_work *cw;
- cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+ cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (cw == NULL) {
css_put(&memcg->css);
return;
@@ -3526,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
cw->memcg = memcg;
cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_create_cache_work_func);
+ INIT_WORK(&cw->work, memcg_register_cache_func);
schedule_work(&cw->work);
}
-static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_create_cache_enqueue will recurse.
+ * in __memcg_schedule_register_cache will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -3545,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
memcg_stop_kmem_account();
- __memcg_create_cache_enqueue(memcg, cachep);
+ __memcg_schedule_register_cache(memcg, cachep);
memcg_resume_kmem_account();
}
+
+int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
+{
+ int res;
+
+ res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
+ PAGE_SIZE << order);
+ if (!res)
+ atomic_add(1 << order, &cachep->memcg_params->nr_pages);
+ return res;
+}
+
+void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
+{
+ memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
+ atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
+}
+
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -3565,7 +3342,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
- int idx;
+ struct kmem_cache *memcg_cachep;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3579,20 +3356,14 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!memcg_can_account_kmem(memcg))
goto out;
- idx = memcg_cache_id(memcg);
-
- /*
- * barrier to mare sure we're always seeing the up to date value. The
- * code updating memcg_caches will issue a write barrier to match this.
- */
- read_barrier_depends();
- if (likely(cache_from_memcg_idx(cachep, idx))) {
- cachep = cache_from_memcg_idx(cachep, idx);
+ memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ if (likely(memcg_cachep)) {
+ cachep = memcg_cachep;
goto out;
}
/* The corresponding put will be done in the workqueue. */
- if (!css_tryget(&memcg->css))
+ if (!css_tryget_online(&memcg->css))
goto out;
rcu_read_unlock();
@@ -3604,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
*
* However, there are some clashes that can arrive from locking.
* For instance, because we acquire the slab_mutex while doing
- * kmem_cache_dup, this means no further allocation could happen
- * with the slab_mutex held.
- *
- * Also, because cache creation issue get_online_cpus(), this
- * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
- * that ends up reversed during cpu hotplug. (cpuset allocates
- * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
- * better to defer everything.
+ * memcg_create_kmem_cache, this means no further allocation
+ * could happen with the slab_mutex held. So it's better to
+ * defer everything.
*/
- memcg_create_cache_enqueue(memcg, cachep);
+ memcg_schedule_register_cache(memcg, cachep);
return cachep;
out:
rcu_read_unlock();
return cachep;
}
-EXPORT_SYMBOL(__memcg_kmem_get_cache);
/*
* We need to verify if the allocation against current->mm->owner's memcg is
@@ -3646,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
/*
* Disabling accounting is only relevant for some specific memcg
* internal allocations. Therefore we would initially not have such
- * check here, since direct calls to the page allocator that are marked
- * with GFP_KMEMCG only happen outside memcg core. We are mostly
- * concerned with cache allocations, and by having this test at
- * memcg_kmem_get_cache, we are already able to relay the allocation to
- * the root cache and bypass the memcg cache altogether.
+ * check here, since direct calls to the page allocator that are
+ * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+ * outside memcg core. We are mostly concerned with cache allocations,
+ * and by having this test at memcg_kmem_get_cache, we are already able
+ * to relay the allocation to the root cache and bypass the memcg cache
+ * altogether.
*
* There is one exception, though: the SLUB allocator does not create
* large order caches, but rather service large kmallocs directly from
@@ -3670,15 +3436,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
if (!current->mm || current->memcg_kmem_skip_account)
return true;
- memcg = try_get_mem_cgroup_from_mm(current->mm);
-
- /*
- * very rare case described in mem_cgroup_from_task. Unfortunately there
- * isn't much we can do without complicating this too much, and it would
- * be gfp-dependent anyway. Just let it go
- */
- if (unlikely(!memcg))
- return true;
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
@@ -3741,11 +3499,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
if (!memcg)
return;
- VM_BUG_ON(mem_cgroup_is_root(memcg));
+ VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
-static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -3781,19 +3539,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline
-void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
- struct mem_cgroup *to,
- unsigned int nr_pages,
- enum mem_cgroup_stat_index idx)
-{
- /* Update stat data for mem_cgroup */
- preempt_disable();
- __this_cpu_sub(from->stat->count[idx], nr_pages);
- __this_cpu_add(to->stat->count[idx], nr_pages);
- preempt_enable();
-}
-
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
@@ -3820,7 +3565,7 @@ static int mem_cgroup_move_account(struct page *page,
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
@@ -3839,13 +3584,19 @@ static int mem_cgroup_move_account(struct page *page,
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_FILE_MAPPED);
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
- if (PageWriteback(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_WRITEBACK);
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
@@ -3913,7 +3664,7 @@ static int mem_cgroup_move_parent(struct page *page,
parent = root_mem_cgroup;
if (nr_pages > 1) {
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
flags = compound_lock_irqsave(page);
}
@@ -3931,23 +3682,23 @@ out:
return ret;
}
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+int mem_cgroup_charge_anon(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
+ struct mem_cgroup *memcg;
bool oom = true;
- int ret;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ VM_BUG_ON(!mm);
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
@@ -3955,25 +3706,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
oom = false;
}
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
- if (ret == -ENOMEM)
- return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages,
+ MEM_CGROUP_CHARGE_TYPE_ANON, false);
return 0;
}
-int mem_cgroup_newpage_charge(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- if (mem_cgroup_disabled())
- return 0;
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping && !PageAnon(page));
- VM_BUG_ON(!mm);
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
@@ -3985,7 +3725,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
gfp_t mask,
struct mem_cgroup **memcgp)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
int ret;
@@ -3998,31 +3738,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
- return 0;
- if (!do_swap_account)
- goto charge_cur_mm;
- memcg = try_get_mem_cgroup_from_page(page);
+ goto out;
+ if (do_swap_account)
+ memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
- goto charge_cur_mm;
- *memcgp = memcg;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, mask, 1, true);
css_put(&memcg->css);
if (ret == -EINTR)
- ret = 0;
- return ret;
-charge_cur_mm:
- ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = root_mem_cgroup;
+ else if (ret)
+ return ret;
+out:
+ *memcgp = memcg;
+ return 0;
}
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
- *memcgp = NULL;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled()) {
+ *memcgp = NULL;
return 0;
+ }
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
@@ -4030,12 +3768,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
- int ret;
+ struct mem_cgroup *memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ *memcgp = memcg;
+ return 0;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
@@ -4079,11 +3818,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
MEM_CGROUP_CHARGE_TYPE_ANON);
}
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ struct mem_cgroup *memcg;
int ret;
if (mem_cgroup_disabled())
@@ -4091,15 +3830,20 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
- if (!PageSwapCache(page))
- ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
- else { /* page is swapcache/shmem */
+ if (PageSwapCache(page)) { /* shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
- if (!ret)
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ if (ret)
+ return ret;
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ return 0;
}
- return ret;
+
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+ return 0;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -4172,7 +3916,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
- VM_BUG_ON(!PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
}
/*
* Check if our page_cgroup is valid
@@ -4264,7 +4008,7 @@ void mem_cgroup_uncharge_page(struct page *page)
/* early check. */
if (page_mapped(page))
return;
- VM_BUG_ON(page->mapping && !PageAnon(page));
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
/*
* If the page is in swap cache, uncharge should be deferred
* to the swap path, which also properly accounts swap usage
@@ -4284,8 +4028,8 @@ void mem_cgroup_uncharge_page(struct page *page)
void mem_cgroup_uncharge_cache_page(struct page *page)
{
- VM_BUG_ON(page_mapped(page));
- VM_BUG_ON(page->mapping);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping, page);
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
@@ -4379,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
memcg = mem_cgroup_lookup(id);
if (memcg) {
/*
- * We uncharge this because swap is freed.
- * This memcg can be obsolete one. We avoid calling css_tryget
+ * We uncharge this because swap is freed. This memcg can
+ * be obsolete one. We avoid calling css_tryget_online().
*/
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
@@ -4834,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (1);
}
- __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ __mem_cgroup_remove_exceeded(mz, mctz);
excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
@@ -4845,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->memcg->css);
loop++;
@@ -4912,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
if (mem_cgroup_move_parent(page, pc, memcg)) {
/* found lock contention or "pc" is obsolete. */
busy = page;
- cond_resched();
} else
busy = NULL;
+ cond_resched();
} while (!list_empty(list));
}
@@ -4965,18 +4709,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
} while (usage > 0);
}
+/*
+ * Test whether @memcg has children, dead or alive. Note that this
+ * function doesn't care whether @memcg has use_hierarchy enabled and
+ * returns %true if there are child csses according to the cgroup
+ * hierarchy. Testing use_hierarchy is the caller's responsiblity.
+ */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
- lockdep_assert_held(&memcg_create_mutex);
+ bool ret;
+
/*
- * The lock does not prevent addition or deletion to the list
- * of children, but it prevents a new child from being
- * initialized based on this parent in css_online(), so it's
- * enough to decide whether hierarchically inherited
- * attributes can still be changed or not.
+ * The lock does not prevent addition or deletion of children, but
+ * it prevents a new child from being initialized based on this
+ * parent in css_online(), so it's enough to decide whether
+ * hierarchically inherited attributes can still be changed or not.
*/
- return memcg->use_hierarchy &&
- !list_empty(&memcg->css.cgroup->children);
+ lockdep_assert_held(&memcg_create_mutex);
+
+ rcu_read_lock();
+ ret = css_next_child(NULL, &memcg->css);
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -4988,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = memcg->css.cgroup;
-
- /* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
- return -EBUSY;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -5012,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
}
}
- lru_add_drain();
- mem_cgroup_reparent_charges(memcg);
return 0;
}
-static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
- unsigned int event)
+static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
if (mem_cgroup_is_root(memcg))
return -EINVAL;
- return mem_cgroup_force_empty(memcg);
+ return mem_cgroup_force_empty(memcg) ?: nbytes;
}
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
@@ -5039,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
{
int retval = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
+ struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
mutex_lock(&memcg_create_mutex);
@@ -5056,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (list_empty(&memcg->css.cgroup->children))
+ if (!memcg_has_children(memcg))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -5109,14 +4857,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
return val << PAGE_SHIFT;
}
-static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct file *file,
- char __user *buf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- char str[64];
u64 val;
- int name, len;
+ int name;
enum res_type type;
type = MEMFILE_TYPE(cft->private);
@@ -5142,15 +4888,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
BUG();
}
- len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
- return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+ return val;
}
-static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
-{
- int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int err = 0;
+ int memcg_id;
+
+ if (memcg_kmem_is_active(memcg))
+ return 0;
+
+ /*
+ * We are going to allocate memory for data shared by all memory
+ * cgroups so let's stop accounting here.
+ */
+ memcg_stop_kmem_account();
+
/*
* For simplicity, we won't allow this to be disabled. It also can't
* be changed if the cgroup has children already, or if tasks had
@@ -5164,89 +4921,121 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
* of course permitted.
*/
mutex_lock(&memcg_create_mutex);
- mutex_lock(&set_limit_mutex);
- if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
- if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
- ret = -EBUSY;
- goto out;
- }
- ret = res_counter_set_limit(&memcg->kmem, val);
- VM_BUG_ON(ret);
+ if (cgroup_has_tasks(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ err = -EBUSY;
+ mutex_unlock(&memcg_create_mutex);
+ if (err)
+ goto out;
- ret = memcg_update_cache_sizes(memcg);
- if (ret) {
- res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
- goto out;
- }
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- } else
- ret = res_counter_set_limit(&memcg->kmem, val);
+ memcg_id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (memcg_id < 0) {
+ err = memcg_id;
+ goto out;
+ }
+
+ /*
+ * Make sure we have enough space for this cgroup in each root cache's
+ * memcg_params.
+ */
+ mutex_lock(&memcg_slab_mutex);
+ err = memcg_update_all_caches(memcg_id + 1);
+ mutex_unlock(&memcg_slab_mutex);
+ if (err)
+ goto out_rmid;
+
+ memcg->kmemcg_id = memcg_id;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+
+ /*
+ * We couldn't have accounted to this cgroup, because it hasn't got the
+ * active bit set yet, so this should succeed.
+ */
+ err = res_counter_set_limit(&memcg->kmem, limit);
+ VM_BUG_ON(err);
+
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * Setting the active bit after enabling static branching will
+ * guarantee no one starts accounting before all call sites are
+ * patched.
+ */
+ memcg_kmem_set_active(memcg);
out:
- mutex_unlock(&set_limit_mutex);
- mutex_unlock(&memcg_create_mutex);
-#endif
+ memcg_resume_kmem_account();
+ return err;
+
+out_rmid:
+ ida_simple_remove(&kmem_limited_groups, memcg_id);
+ goto out;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+ unsigned long long limit)
+{
+ int ret;
+
+ mutex_lock(&activate_kmem_mutex);
+ ret = __memcg_activate_kmem(memcg, limit);
+ mutex_unlock(&activate_kmem_mutex);
+ return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ int ret;
+
+ if (!memcg_kmem_is_active(memcg))
+ ret = memcg_activate_kmem(memcg, val);
+ else
+ ret = res_counter_set_limit(&memcg->kmem, val);
return ret;
}
-#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- if (!parent)
- goto out;
- memcg->kmem_account_flags = parent->kmem_account_flags;
- /*
- * When that happen, we need to disable the static branch only on those
- * memcgs that enabled it. To achieve this, we would be forced to
- * complicate the code by keeping track of which memcgs were the ones
- * that actually enabled limits, and which ones got it from its
- * parents.
- *
- * It is a lot simpler just to do static_key_slow_inc() on every child
- * that is accounted.
- */
- if (!memcg_kmem_is_active(memcg))
- goto out;
+ if (!parent)
+ return 0;
+ mutex_lock(&activate_kmem_mutex);
/*
- * __mem_cgroup_free() will issue static_key_slow_dec() because this
- * memcg is active already. If the later initialization fails then the
- * cgroup core triggers the cleanup so we do not have to do it here.
+ * If the parent cgroup is not kmem-active now, it cannot be activated
+ * after this point, because it has at least one child already.
*/
- static_key_slow_inc(&memcg_kmem_enabled_key);
-
- mutex_lock(&set_limit_mutex);
- memcg_stop_kmem_account();
- ret = memcg_update_cache_sizes(memcg);
- memcg_resume_kmem_account();
- mutex_unlock(&set_limit_mutex);
-out:
+ if (memcg_kmem_is_active(parent))
+ ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
+ mutex_unlock(&activate_kmem_mutex);
return ret;
}
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+ unsigned long long val)
+{
+ return -EINVAL;
+}
#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
* RES_LIMIT.
*/
-static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
- const char *buffer)
+static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
enum res_type type;
int name;
unsigned long long val;
int ret;
- type = MEMFILE_TYPE(cft->private);
- name = MEMFILE_ATTR(cft->private);
+ buf = strstrip(buf);
+ type = MEMFILE_TYPE(of_cft(of)->private);
+ name = MEMFILE_ATTR(of_cft(of)->private);
switch (name) {
case RES_LIMIT:
@@ -5255,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
/* This function does all necessary parse...reuse it */
- ret = res_counter_memparse_write_strategy(buffer, &val);
+ ret = res_counter_memparse_write_strategy(buf, &val);
if (ret)
break;
if (type == _MEM)
@@ -5263,12 +5052,12 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
else if (type == _KMEM)
- ret = memcg_update_kmem_limit(css, val);
+ ret = memcg_update_kmem_limit(memcg, val);
else
return -EINVAL;
break;
case RES_SOFT_LIMIT:
- ret = res_counter_memparse_write_strategy(buffer, &val);
+ ret = res_counter_memparse_write_strategy(buf, &val);
if (ret)
break;
/*
@@ -5285,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
ret = -EINVAL; /* should be BUG() ? */
break;
}
- return ret;
+ return ret ?: nbytes;
}
static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
@@ -5298,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
if (!memcg->use_hierarchy)
goto out;
- while (css_parent(&memcg->css)) {
- memcg = mem_cgroup_from_css(css_parent(&memcg->css));
+ while (memcg->css.parent) {
+ memcg = mem_cgroup_from_css(memcg->css.parent);
if (!memcg->use_hierarchy)
break;
tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -5312,14 +5101,15 @@ out:
*memsw_limit = min_memsw_limit;
}
-static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
+static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
int name;
enum res_type type;
- type = MEMFILE_TYPE(event);
- name = MEMFILE_ATTR(event);
+ type = MEMFILE_TYPE(of_cft(of)->private);
+ name = MEMFILE_ATTR(of_cft(of)->private);
switch (name) {
case RES_MAX_USAGE:
@@ -5344,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
break;
}
- return 0;
+ return nbytes;
}
static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
@@ -5380,8 +5170,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
-static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
- struct cftype *cft, struct seq_file *m)
+static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
const char *name;
@@ -5397,7 +5186,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5436,10 +5225,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
- struct seq_file *m)
+static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct mem_cgroup *mi;
unsigned int i;
@@ -5505,7 +5293,7 @@ static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
rstat = &mz->lruvec.reclaim_stat;
recent_rotated[0] += rstat->recent_rotated[0];
@@ -5535,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
-
- if (val > 100 || !parent)
- return -EINVAL;
-
- mutex_lock(&memcg_create_mutex);
- /* If under hierarchy, only empty-root can set this value */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
+ if (val > 100)
return -EINVAL;
- }
-
- memcg->swappiness = val;
- mutex_unlock(&memcg_create_mutex);
+ if (css->parent)
+ memcg->swappiness = val;
+ else
+ vm_swappiness = val;
return 0;
}
@@ -5635,8 +5415,12 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
struct mem_cgroup_eventfd_list *ev;
+ spin_lock(&memcg_oom_lock);
+
list_for_each_entry(ev, &memcg->oom_notify, list)
eventfd_signal(ev->eventfd, 1);
+
+ spin_unlock(&memcg_oom_lock);
return 0;
}
@@ -5648,13 +5432,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
mem_cgroup_oom_notify_cb(iter);
}
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -5731,13 +5513,23 @@ unlock:
return ret;
}
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
+{
+ return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, enum res_type type)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -5810,14 +5602,23 @@ unlock:
mutex_unlock(&memcg->thresholds_lock);
}
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
+{
+ return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd, const char *args)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *event;
- enum res_type type = MEMFILE_TYPE(cft->private);
- BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -5835,14 +5636,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
return 0;
}
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
- struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+ struct eventfd_ctx *eventfd)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup_eventfd_list *ev, *tmp;
- enum res_type type = MEMFILE_TYPE(cft->private);
-
- BUG_ON(type != _OOM_TYPE);
spin_lock(&memcg_oom_lock);
@@ -5856,17 +5653,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
spin_unlock(&memcg_oom_lock);
}
-static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
- struct cftype *cft, struct cgroup_map_cb *cb)
+static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
-
- if (atomic_read(&memcg->under_oom))
- cb->fill(cb, "under_oom", 1);
- else
- cb->fill(cb, "under_oom", 0);
+ seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
+ seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
return 0;
}
@@ -5874,22 +5666,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
/* cannot set to root cgroup and only 0 and 1 are allowed */
- if (!parent || !((val == 0) || (val == 1)))
+ if (!css->parent || !((val == 0) || (val == 1)))
return -EINVAL;
- mutex_lock(&memcg_create_mutex);
- /* oom-kill-disable is a flag for subhierarchy. */
- if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
- mutex_unlock(&memcg_create_mutex);
- return -EINVAL;
- }
memcg->oom_kill_disable = val;
if (!val)
memcg_oom_recover(memcg);
- mutex_unlock(&memcg_create_mutex);
+
return 0;
}
@@ -5929,10 +5714,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
* which is then paired with css_put during uncharge resp. here.
*
* Although this might sound strange as this path is called from
- * css_offline() when the referencemight have dropped down to 0
- * and shouldn't be incremented anymore (css_tryget would fail)
- * we do not have other options because of the kmem allocations
- * lifetime.
+ * css_offline() when the referencemight have dropped down to 0 and
+ * shouldn't be incremented anymore (css_tryget_online() would
+ * fail) we do not have other options because of the kmem
+ * allocations lifetime.
*/
css_get(&memcg->css);
@@ -5959,45 +5744,266 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
}
#endif
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered. It tries to support fully configurable
+ * events for each user. Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+ struct mem_cgroup_event *event =
+ container_of(work, struct mem_cgroup_event, remove);
+ struct mem_cgroup *memcg = event->memcg;
+
+ remove_wait_queue(event->wqh, &event->wait);
+
+ event->unregister_event(memcg, event->eventfd);
+
+ /* Notify userspace the event is going away. */
+ eventfd_signal(event->eventfd, 1);
+
+ eventfd_ctx_put(event->eventfd);
+ kfree(event);
+ css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct mem_cgroup_event *event =
+ container_of(wait, struct mem_cgroup_event, wait);
+ struct mem_cgroup *memcg = event->memcg;
+ unsigned long flags = (unsigned long)key;
+
+ if (flags & POLLHUP) {
+ /*
+ * If the event has been detached at cgroup removal, we
+ * can simply return knowing the other side will cleanup
+ * for us.
+ *
+ * We can't race against event freeing since the other
+ * side will require wqh->lock via remove_wait_queue(),
+ * which we hold.
+ */
+ spin_lock(&memcg->event_list_lock);
+ if (!list_empty(&event->list)) {
+ list_del_init(&event->list);
+ /*
+ * We are in atomic context, but cgroup_event_remove()
+ * may sleep, so we have to call it in workqueue.
+ */
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
+ }
+
+ return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+ wait_queue_head_t *wqh, poll_table *pt)
+{
+ struct mem_cgroup_event *event =
+ container_of(pt, struct mem_cgroup_event, pt);
+
+ event->wqh = wqh;
+ add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event;
+ struct cgroup_subsys_state *cfile_css;
+ unsigned int efd, cfd;
+ struct fd efile;
+ struct fd cfile;
+ const char *name;
+ char *endp;
+ int ret;
+
+ buf = strstrip(buf);
+
+ efd = simple_strtoul(buf, &endp, 10);
+ if (*endp != ' ')
+ return -EINVAL;
+ buf = endp + 1;
+
+ cfd = simple_strtoul(buf, &endp, 10);
+ if ((*endp != ' ') && (*endp != '\0'))
+ return -EINVAL;
+ buf = endp + 1;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->memcg = memcg;
+ INIT_LIST_HEAD(&event->list);
+ init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+ init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+ INIT_WORK(&event->remove, memcg_event_remove);
+
+ efile = fdget(efd);
+ if (!efile.file) {
+ ret = -EBADF;
+ goto out_kfree;
+ }
+
+ event->eventfd = eventfd_ctx_fileget(efile.file);
+ if (IS_ERR(event->eventfd)) {
+ ret = PTR_ERR(event->eventfd);
+ goto out_put_efile;
+ }
+
+ cfile = fdget(cfd);
+ if (!cfile.file) {
+ ret = -EBADF;
+ goto out_put_eventfd;
+ }
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+ ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+ /*
+ * Determine the event callbacks and set them in @event. This used
+ * to be done via struct cftype but cgroup core no longer knows
+ * about these events. The following is crude but the whole thing
+ * is for compatibility anyway.
+ *
+ * DO NOT ADD NEW FILES.
+ */
+ name = cfile.file->f_dentry->d_name.name;
+
+ if (!strcmp(name, "memory.usage_in_bytes")) {
+ event->register_event = mem_cgroup_usage_register_event;
+ event->unregister_event = mem_cgroup_usage_unregister_event;
+ } else if (!strcmp(name, "memory.oom_control")) {
+ event->register_event = mem_cgroup_oom_register_event;
+ event->unregister_event = mem_cgroup_oom_unregister_event;
+ } else if (!strcmp(name, "memory.pressure_level")) {
+ event->register_event = vmpressure_register_event;
+ event->unregister_event = vmpressure_unregister_event;
+ } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+ event->register_event = memsw_cgroup_usage_register_event;
+ event->unregister_event = memsw_cgroup_usage_unregister_event;
+ } else {
+ ret = -EINVAL;
+ goto out_put_cfile;
+ }
+
+ /*
+ * Verify @cfile should belong to @css. Also, remaining events are
+ * automatically removed on cgroup destruction but the removal is
+ * asynchronous, so take an extra ref on @css.
+ */
+ cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
+ &memory_cgrp_subsys);
+ ret = -EINVAL;
+ if (IS_ERR(cfile_css))
+ goto out_put_cfile;
+ if (cfile_css != css) {
+ css_put(cfile_css);
+ goto out_put_cfile;
+ }
+
+ ret = event->register_event(memcg, event->eventfd, buf);
+ if (ret)
+ goto out_put_css;
+
+ efile.file->f_op->poll(efile.file, &event->pt);
+
+ spin_lock(&memcg->event_list_lock);
+ list_add(&event->list, &memcg->event_list);
+ spin_unlock(&memcg->event_list_lock);
+
+ fdput(cfile);
+ fdput(efile);
+
+ return nbytes;
+
+out_put_css:
+ css_put(css);
+out_put_cfile:
+ fdput(cfile);
+out_put_eventfd:
+ eventfd_ctx_put(event->eventfd);
+out_put_efile:
+ fdput(efile);
+out_kfree:
+ kfree(event);
+
+ return ret;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
- .write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "soft_limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
- .write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "failcnt",
.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "stat",
- .read_seq_string = memcg_stat_show,
+ .seq_show = memcg_stat_show,
},
{
.name = "force_empty",
- .trigger = mem_cgroup_force_empty_write,
+ .write = mem_cgroup_force_empty_write,
},
{
.name = "use_hierarchy",
@@ -6006,6 +6012,12 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_hierarchy_read,
},
{
+ .name = "cgroup.event_control", /* XXX: for compat */
+ .write = memcg_write_event_control,
+ .flags = CFTYPE_NO_PREFIX,
+ .mode = S_IWUGO,
+ },
+ {
.name = "swappiness",
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
@@ -6017,51 +6029,47 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "oom_control",
- .read_map = mem_cgroup_oom_control_read,
+ .seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .register_event = mem_cgroup_oom_register_event,
- .unregister_event = mem_cgroup_oom_unregister_event,
.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
- .register_event = vmpressure_register_event,
- .unregister_event = vmpressure_unregister_event,
},
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = memcg_numa_stat_show,
+ .seq_show = memcg_numa_stat_show,
},
#endif
#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
- .write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
- .read = mem_cgroup_read,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.failcnt",
.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "kmem.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
#ifdef CONFIG_SLABINFO
{
.name = "kmem.slabinfo",
- .read_seq_string = mem_cgroup_slabinfo_read,
+ .seq_show = mem_cgroup_slabinfo_read,
},
#endif
#endif
@@ -6073,27 +6081,25 @@ static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.max_usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.limit_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
},
{
.name = "memsw.failcnt",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
},
{ }, /* terminate */
};
@@ -6136,14 +6142,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size = memcg_size();
+ size_t size;
- /* Can be very big if nr_node_ids is very big */
- if (size < PAGE_SIZE)
- memcg = kzalloc(size, GFP_KERNEL);
- else
- memcg = vzalloc(size);
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
return NULL;
@@ -6154,10 +6158,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return memcg;
out_free:
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
return NULL;
}
@@ -6175,7 +6176,6 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
@@ -6196,10 +6196,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
* the cgroup_lock.
*/
disarm_static_keys(memcg);
- if (size < PAGE_SIZE)
- kfree(memcg);
- else
- vfree(memcg);
+ kfree(memcg);
}
/*
@@ -6265,6 +6262,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->event_list);
+ spin_lock_init(&memcg->event_list_lock);
return &memcg->css;
@@ -6277,10 +6276,9 @@ static int
mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
- int error = 0;
+ struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ if (css->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
if (!parent)
@@ -6311,12 +6309,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
* unfortunate state in our controller.
*/
if (parent != root_mem_cgroup)
- mem_cgroup_subsys.broken_hierarchy = true;
+ memory_cgrp_subsys.broken_hierarchy = true;
}
-
- error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- return error;
+
+ return memcg_init_kmem(memcg, &memory_cgrp_subsys);
}
/*
@@ -6340,18 +6337,75 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup_event *event, *tmp;
+ struct cgroup_subsys_state *iter;
+
+ /*
+ * Unregister events and notify userspace.
+ * Notify userspace about cgroup removing only after rmdir of cgroup
+ * directory to avoid race between userspace and kernelspace.
+ */
+ spin_lock(&memcg->event_list_lock);
+ list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+ list_del_init(&event->list);
+ schedule_work(&event->remove);
+ }
+ spin_unlock(&memcg->event_list_lock);
kmem_cgroup_css_offline(memcg);
mem_cgroup_invalidate_reclaim_iterators(memcg);
- mem_cgroup_reparent_charges(memcg);
- mem_cgroup_destroy_all_caches(memcg);
+
+ /*
+ * This requires that offlining is serialized. Right now that is
+ * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
+ */
+ css_for_each_descendant_post(iter, css)
+ mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
+
+ memcg_unregister_all_caches(memcg);
vmpressure_cleanup(&memcg->vmpressure);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget_online() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget_online()
+ * rcu_read_unlock()
+ * disable css_tryget_online()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
@@ -6401,8 +6455,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL,
- GFP_KERNEL, 1, &memcg, false);
+ ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
@@ -6506,16 +6559,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
pgoff = pte_to_pgoff(ptent);
/* page is moved even if it's not RSS of this task(page-faulted). */
- page = find_get_page(mapping, pgoff);
-
#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- if (do_swap_account)
- *entry = swap;
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ if (do_swap_account)
+ *entry = swp;
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
return page;
}
@@ -6576,7 +6633,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
enum mc_target_type ret = MC_TARGET_NONE;
page = pmd_page(pmd);
- VM_BUG_ON(!page || !PageHead(page));
+ VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!move_anon())
return ret;
pc = lookup_page_cgroup(page);
@@ -6961,9 +7018,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys mem_cgroup_subsys = {
- .name = "memory",
- .subsys_id = mem_cgroup_subsys_id,
+struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
@@ -6989,7 +7044,7 @@ __setup("swapaccount=", enable_swap_account);
static void __init memsw_file_init(void)
{
- WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+ WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
}
static void __init enable_swap_cgroup(void)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c171602ba..a013bc94ebb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
return -EINVAL;
css = mem_cgroup_css(mem);
- /* root_mem_cgroup has NULL dentries */
- if (!css->cgroup->dentry)
- return -EINVAL;
-
- ino = css->cgroup->dentry->d_inode->i_ino;
+ ino = cgroup_ino(css->cgroup);
css_put(css);
- if (ino != hwpoison_filter_memcg)
+ if (!ino || ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
@@ -208,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
#endif
si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
- if ((flags & MF_ACTION_REQUIRED) && t == current) {
+ if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
si.si_code = BUS_MCEERR_AR;
- ret = force_sig_info(SIGBUS, &si, t);
+ ret = force_sig_info(SIGBUS, &si, current);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -384,20 +380,51 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
}
}
-static int task_early_kill(struct task_struct *tsk)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
+ struct task_struct *t;
+
+ for_each_thread(tsk, t)
+ if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+ return t;
+ return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+ int force_early)
+{
+ struct task_struct *t;
if (!tsk->mm)
- return 0;
- if (tsk->flags & PF_MCE_PROCESS)
- return !!(tsk->flags & PF_MCE_EARLY);
- return sysctl_memory_failure_early_kill;
+ return NULL;
+ if (force_early)
+ return tsk;
+ t = find_early_kill_thread(tsk);
+ if (t)
+ return t;
+ if (sysctl_memory_failure_early_kill)
+ return tsk;
+ return NULL;
}
/*
* Collect processes when the error hit an anonymous page.
*/
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -408,20 +435,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (av == NULL) /* Not actually mapped anymore */
return;
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff = page_to_pgoff(page);
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -432,7 +460,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
* Collect processes when the error hit a file mapped page.
*/
static void collect_procs_file(struct page *page, struct list_head *to_kill,
- struct to_kill **tkc)
+ struct to_kill **tkc, int force_early)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
@@ -441,11 +469,11 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
mutex_lock(&mapping->i_mmap_mutex);
read_lock(&tasklist_lock);
for_each_process(tsk) {
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct task_struct *t = task_early_kill(tsk, force_early);
- if (!task_early_kill(tsk))
+ if (!t)
continue;
-
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
@@ -455,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* Assume applications who requested early kill want
* to be informed of all such data corruptions.
*/
- if (vma->vm_mm == tsk->mm)
- add_to_kill(tsk, page, vma, to_kill, tkc);
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, vma, to_kill, tkc);
}
}
read_unlock(&tasklist_lock);
@@ -469,7 +497,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* First preallocate one tokill structure outside the spin locks,
* so that we can kill at least one process reasonably reliable.
*/
-static void collect_procs(struct page *page, struct list_head *tokill)
+static void collect_procs(struct page *page, struct list_head *tokill,
+ int force_early)
{
struct to_kill *tk;
@@ -480,9 +509,9 @@ static void collect_procs(struct page *page, struct list_head *tokill)
if (!tk)
return;
if (PageAnon(page))
- collect_procs_anon(page, tokill, &tk);
+ collect_procs_anon(page, tokill, &tk, force_early);
else
- collect_procs_file(page, tokill, &tk);
+ collect_procs_file(page, tokill, &tk, force_early);
kfree(tk);
}
@@ -611,7 +640,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
- * Dirty cache page page
+ * Dirty pagecache page
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
@@ -856,18 +885,24 @@ static int page_action(struct page_state *ps, struct page *p,
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
- int trapno, int flags)
+ int trapno, int flags, struct page **hpagep)
{
enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
int kill = 1, forcekill;
- struct page *hpage = compound_head(p);
+ struct page *hpage = *hpagep;
struct page *ppage;
+ /*
+ * Here we are interested only in user-mapped pages, so skip any
+ * other types of pages.
+ */
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
+ if (!(PageLRU(hpage) || PageHuge(p)))
+ return SWAP_SUCCESS;
/*
* This check implies we don't kill processes if their pages
@@ -876,8 +911,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageKsm(p))
+ if (PageKsm(p)) {
+ pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
return SWAP_FAIL;
+ }
if (PageSwapCache(p)) {
printk(KERN_ERR
@@ -938,6 +975,21 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it. Similarly, page lock is shifted.
+ */
+ if (hpage != p) {
+ if (!(flags & MF_COUNT_INCREASED)) {
+ put_page(hpage);
+ get_page(p);
+ }
+ lock_page(p);
+ unlock_page(hpage);
+ *hpagep = p;
+ }
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
}
@@ -952,19 +1004,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill);
-
- if (hpage != ppage)
- lock_page(ppage);
+ collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(ppage));
- if (hpage != ppage)
- unlock_page(ppage);
-
/*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
@@ -1076,15 +1122,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
} else if (PageHuge(hpage)) {
/*
- * Check "just unpoisoned", "filter hit", and
- * "race with other subpage."
+ * Check "filter hit" and "race with other subpage."
*/
lock_page(hpage);
- if (!PageHWPoison(hpage)
- || (hwpoison_filter(p) && TestClearPageHWPoison(p))
- || (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- return 0;
+ if (PageHWPoison(hpage)) {
+ if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
+ return 0;
+ }
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
@@ -1120,17 +1167,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
action_result(pfn, "free buddy, 2nd try", DELAYED);
return 0;
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
}
- /*
- * Lock the page and wait for writeback to finish.
- * It's very difficult to mess with pages currently under IO
- * and in many cases impossible, so we just avoid it here.
- */
lock_page(hpage);
/*
@@ -1147,6 +1186,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(hpage);
res = 0;
goto out;
}
@@ -1158,6 +1199,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
+ if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+ goto identify_page_state;
+
/*
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
@@ -1178,14 +1222,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (PageHuge(p))
set_page_hwpoison_huge_page(hpage);
+ /*
+ * It's very difficult to mess with pages currently under IO
+ * and in many cases impossible, so we just avoid it here.
+ */
wait_on_page_writeback(p);
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ *
+ * When the raw error page is thp tail page, hpage points to the raw
+ * page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
- printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
+ != SWAP_SUCCESS) {
+ action_result(pfn, "unmapping failed", IGNORED);
res = -EBUSY;
goto out;
}
@@ -1199,6 +1251,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
goto out;
}
+identify_page_state:
res = -EBUSY;
/*
* The first check uses the current page flags which may not have any
@@ -1286,7 +1339,7 @@ static void memory_failure_work_func(struct work_struct *work)
unsigned long proc_flags;
int gotten;
- mf_cpu = &__get_cpu_var(memory_failure_cpu);
+ mf_cpu = this_cpu_ptr(&memory_failure_cpu);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1491,7 +1544,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_move(&hpage->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1505,10 +1558,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
}
return ret;
}
@@ -1566,10 +1625,16 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- putback_lru_pages(&pagelist);
+ if (!list_empty(&pagelist)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
+
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
@@ -1626,7 +1691,7 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_trans_head(page);
+ struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1640,11 +1705,7 @@ int soft_offline_page(struct page *page, int flags)
}
}
- /*
- * The lock_memory_hotplug prevents a race with memory hotplug.
- * This is a big hammer, a better would be nicer.
- */
- lock_memory_hotplug();
+ get_online_mems();
/*
* Isolate the page, so that it doesn't get reallocated if it
@@ -1655,7 +1716,7 @@ int soft_offline_page(struct page *page, int flags)
set_migratetype_isolate(page, true);
ret = get_any_page(page, pfn, flags);
- unlock_memory_hotplug();
+ put_online_mems();
if (ret > 0) { /* for in-use pages */
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
diff --git a/mm/memory.c b/mm/memory.c
index 5d9025f3b3e..8b44f765b64 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,8 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/dma-debug.h>
+#include <linux/debugfs.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -230,17 +232,18 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
#endif
}
-void tlb_flush_mmu(struct mmu_gather *tlb)
+static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
- struct mmu_gather_batch *batch;
-
- if (!tlb->need_flush)
- return;
tlb->need_flush = 0;
tlb_flush(tlb);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
+}
+
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
@@ -249,6 +252,14 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
tlb->active = &tlb->local;
}
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ if (!tlb->need_flush)
+ return;
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
/* tlb_finish_mmu
* Called at the end of the shootdown operation to free up any resources
* that were required.
@@ -288,7 +299,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
return 0;
batch = tlb->active;
}
- VM_BUG_ON(batch->nr > batch->max);
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
return batch->max - batch->nr;
}
@@ -670,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
current->comm,
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
- dump_page(page);
+ dump_page(page, "bad pte");
printk(KERN_ALERT
"addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
@@ -687,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
-static inline bool is_cow_mapping(vm_flags_t flags)
-{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-}
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
@@ -745,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte)))
+ if (likely(!pte_special(pte) || pte_numa(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -771,14 +777,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
- if (is_zero_pfn(pfn))
- return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
+ if (is_zero_pfn(pfn))
+ return NULL;
+
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
@@ -1125,8 +1132,10 @@ again:
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
- if (pte_dirty(ptent))
+ if (pte_dirty(ptent)) {
+ force_flush = 1;
set_page_dirty(page);
+ }
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
@@ -1135,9 +1144,10 @@ again:
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
- force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush)
+ if (unlikely(!__tlb_remove_page(tlb, page))) {
+ force_flush = 1;
break;
+ }
continue;
}
/*
@@ -1172,18 +1182,11 @@ again:
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- /*
- * mmu_gather ran out of room to batch pages, we break out of
- * the PTE lock to avoid doing the potential expensive TLB invalidate
- * and page-free while holding it.
- */
+ /* Do the actual TLB flush before dropping ptl */
if (force_flush) {
unsigned long old_end;
- force_flush = 0;
-
/*
* Flush the TLB just for the previous segment,
* then update the range to be the remaining
@@ -1191,11 +1194,21 @@ again:
*/
old_end = tlb->end;
tlb->end = addr;
-
- tlb_flush_mmu(tlb);
-
+ tlb_flush_mmu_tlbonly(tlb);
tlb->start = addr;
tlb->end = old_end;
+ }
+ pte_unmap_unlock(start_pte, ptl);
+
+ /*
+ * If we forced a TLB flush (either due to running out of
+ * batch buffers or because we needed to flush dirty TLB
+ * entries before releasing the ptl), free the batched
+ * memory too. Restart if we didn't do everything.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu_free(tlb);
if (addr != end)
goto again;
@@ -1319,9 +1332,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When
+ * cleanup path of mmap_region. When
* hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file
+ * mmap_region() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
@@ -1440,617 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
-/**
- * follow_page_mask - look up a page descriptor from a user-virtual address
- * @vma: vm_area_struct mapping @address
- * @address: virtual address to look up
- * @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
- *
- * @flags can have FOLL_ flags set, defined in <linux/mm.h>
- *
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
- * an error pointer if there is a mapping to something not represented
- * by a page descriptor (see also vm_normal_page()).
- */
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep, pte;
- spinlock_t *ptl;
- struct page *page;
- struct mm_struct *mm = vma->vm_mm;
-
- *page_mask = 0;
-
- page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
- if (!IS_ERR(page)) {
- BUG_ON(flags & FOLL_GET);
- goto out;
- }
-
- page = NULL;
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
- goto no_page_table;
-
- pud = pud_offset(pgd, address);
- if (pud_none(*pud))
- goto no_page_table;
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- goto out;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- goto out;
- }
- if (unlikely(pud_bad(*pud)))
- goto no_page_table;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- goto no_page_table;
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else {
- page = NULL;
- goto out;
- }
- }
- goto out;
- }
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
- goto no_page_table;
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- goto split_fallthrough;
- }
- ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- goto out;
- }
- } else
- spin_unlock(ptl);
- /* fall through */
- }
-split_fallthrough:
- if (unlikely(pmd_bad(*pmd)))
- goto no_page_table;
-
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- pte = *ptep;
- if (!pte_present(pte)) {
- swp_entry_t entry;
- /*
- * KSM's break_ksm() relies upon recognizing a ksm page
- * even while it is being migrated, so for that case we
- * need migration_entry_wait().
- */
- if (likely(!(flags & FOLL_MIGRATION)))
- goto no_page;
- if (pte_none(pte) || pte_file(pte))
- goto no_page;
- entry = pte_to_swp_entry(pte);
- if (!is_migration_entry(entry))
- goto no_page;
- pte_unmap_unlock(ptep, ptl);
- migration_entry_wait(mm, pmd, address);
- goto split_fallthrough;
- }
- if ((flags & FOLL_NUMA) && pte_numa(pte))
- goto no_page;
- if ((flags & FOLL_WRITE) && !pte_write(pte))
- goto unlock;
-
- page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
- if ((flags & FOLL_DUMP) ||
- !is_zero_pfn(pte_pfn(pte)))
- goto bad_page;
- page = pte_page(pte);
- }
-
- if (flags & FOLL_GET)
- get_page_foll(page);
- if (flags & FOLL_TOUCH) {
- if ((flags & FOLL_WRITE) &&
- !pte_dirty(pte) && !PageDirty(page))
- set_page_dirty(page);
- /*
- * pte_mkyoung() would be more correct here, but atomic care
- * is needed to avoid losing the dirty bit: it is easier to use
- * mark_page_accessed().
- */
- mark_page_accessed(page);
- }
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
-unlock:
- pte_unmap_unlock(ptep, ptl);
-out:
- return page;
-
-bad_page:
- pte_unmap_unlock(ptep, ptl);
- return ERR_PTR(-EFAULT);
-
-no_page:
- pte_unmap_unlock(ptep, ptl);
- if (!pte_none(pte))
- return page;
-
-no_page_table:
- /*
- * When core dumping an enormous anonymous area that nobody
- * has touched so far, we don't want to allocate unnecessary pages or
- * page tables. Return error instead of NULL to skip handle_mm_fault,
- * then get_dump_page() will return NULL to leave a hole in the dump.
- * But we can only make this optimization where a hole would surely
- * be zero-filled if handle_mm_fault() actually did handle it.
- */
- if ((flags & FOLL_DUMP) &&
- (!vma->vm_ops || !vma->vm_ops->fault))
- return ERR_PTR(-EFAULT);
- return page;
-}
-
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return stack_guard_page_start(vma, addr) ||
- stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
-/**
- * __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @nonblocking: whether waiting for disk IO or mmap_sem contention
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * __get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * __get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
- * the page is written to, set_page_dirty (or set_page_dirty_lock, as
- * appropriate) must be called after the page is finished with, and
- * before put_page is called.
- *
- * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
- * or mmap_sem contention, and if waiting is needed to pin all pages,
- * *@nonblocking will be set to 0.
- *
- * In most cases, get_user_pages or get_user_pages_fast should be used
- * instead of __get_user_pages. __get_user_pages should be used only if
- * you need some special @gup_flags.
- */
-long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
-{
- long i;
- unsigned long vm_flags;
- unsigned int page_mask;
-
- if (!nr_pages)
- return 0;
-
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
-
- /*
- * Require read or write permissions.
- * If FOLL_FORCE is set, we only require the "MAY" flags.
- */
- vm_flags = (gup_flags & FOLL_WRITE) ?
- (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= (gup_flags & FOLL_FORCE) ?
- (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
-
- /*
- * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
- * would be called on PROT_NONE ranges. We must never invoke
- * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
- * page faults would unprotect the PROT_NONE ranges if
- * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
- * bitflag. So to avoid that, don't set FOLL_NUMA if
- * FOLL_FORCE is set.
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
- i = 0;
-
- do {
- struct vm_area_struct *vma;
-
- vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(mm, start)) {
- unsigned long pg = start & PAGE_MASK;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- /* user gate pages are read-only */
- if (gup_flags & FOLL_WRITE)
- return i ? : -EFAULT;
- if (pg > TASK_SIZE)
- pgd = pgd_offset_k(pg);
- else
- pgd = pgd_offset_gate(mm, pg);
- BUG_ON(pgd_none(*pgd));
- pud = pud_offset(pgd, pg);
- BUG_ON(pud_none(*pud));
- pmd = pmd_offset(pud, pg);
- if (pmd_none(*pmd))
- return i ? : -EFAULT;
- VM_BUG_ON(pmd_trans_huge(*pmd));
- pte = pte_offset_map(pmd, pg);
- if (pte_none(*pte)) {
- pte_unmap(pte);
- return i ? : -EFAULT;
- }
- vma = get_gate_vma(mm);
- if (pages) {
- struct page *page;
-
- page = vm_normal_page(vma, start, *pte);
- if (!page) {
- if (!(gup_flags & FOLL_DUMP) &&
- is_zero_pfn(pte_pfn(*pte)))
- page = pte_page(*pte);
- else {
- pte_unmap(pte);
- return i ? : -EFAULT;
- }
- }
- pages[i] = page;
- get_page(page);
- }
- pte_unmap(pte);
- page_mask = 0;
- goto next_page;
- }
-
- if (!vma ||
- (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
- !(vm_flags & vma->vm_flags))
- return i ? : -EFAULT;
-
- if (is_vm_hugetlb_page(vma)) {
- i = follow_hugetlb_page(mm, vma, pages, vmas,
- &start, &nr_pages, i, gup_flags);
- continue;
- }
-
- do {
- struct page *page;
- unsigned int foll_flags = gup_flags;
- unsigned int page_increm;
-
- /*
- * If we have a pending SIGKILL, don't keep faulting
- * pages and potentially allocating memory.
- */
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
-
- cond_resched();
- while (!(page = follow_page_mask(vma, start,
- foll_flags, &page_mask))) {
- int ret;
- unsigned int fault_flags = 0;
-
- /* For mlock, just skip the stack guard page. */
- if (foll_flags & FOLL_MLOCK) {
- if (stack_guard_page(vma, start))
- goto next_page;
- }
- if (foll_flags & FOLL_WRITE)
- fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
- if (foll_flags & FOLL_NOWAIT)
- fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
-
- ret = handle_mm_fault(mm, vma, start,
- fault_flags);
-
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return i ? i : -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON |
- VM_FAULT_HWPOISON_LARGE)) {
- if (i)
- return i;
- else if (gup_flags & FOLL_HWPOISON)
- return -EHWPOISON;
- else
- return -EFAULT;
- }
- if (ret & VM_FAULT_SIGBUS)
- return i ? i : -EFAULT;
- BUG();
- }
-
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
- if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
- *nonblocking = 0;
- return i;
- }
-
- /*
- * The VM_FAULT_WRITE bit tells us that
- * do_wp_page has broken COW when necessary,
- * even if maybe_mkwrite decided not to set
- * pte_write. We can thus safely do subsequent
- * page lookups as if they were reads. But only
- * do so when looping for pte_write is futile:
- * in some cases userspace may also be wanting
- * to write to the gotten user page, which a
- * read fault here might prevent (a readonly
- * page might get reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) &&
- !(vma->vm_flags & VM_WRITE))
- foll_flags &= ~FOLL_WRITE;
-
- cond_resched();
- }
- if (IS_ERR(page))
- return i ? i : PTR_ERR(page);
- if (pages) {
- pages[i] = page;
-
- flush_anon_page(vma, page, start);
- flush_dcache_page(page);
- page_mask = 0;
- }
-next_page:
- if (vmas) {
- vmas[i] = vma;
- page_mask = 0;
- }
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
- if (page_increm > nr_pages)
- page_increm = nr_pages;
- i += page_increm;
- start += page_increm * PAGE_SIZE;
- nr_pages -= page_increm;
- } while (nr_pages && start < vma->vm_end);
- } while (nr_pages);
- return i;
-}
-EXPORT_SYMBOL(__get_user_pages);
-
-/*
- * fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @address: user address
- * @fault_flags:flags to pass down to handle_mm_fault()
- *
- * This is meant to be called in the specific scenario where for locking reasons
- * we try to access user memory in atomic context (within a pagefault_disable()
- * section), this returns -EFAULT, and we want to resolve the user fault before
- * trying again.
- *
- * Typically this is meant to be used by the futex code.
- *
- * The main difference with get_user_pages() is that this function will
- * unconditionally call handle_mm_fault() which will in turn perform all the
- * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
- *
- * This is important for some architectures where those bits also gate the
- * access permission to the page because they are maintained in software. On
- * such architectures, gup() will not be enough to make a subsequent access
- * succeed.
- *
- * This should be called with the mm_sem held for read.
- */
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
-{
- struct vm_area_struct *vma;
- int ret;
-
- vma = find_extend_vma(mm, address);
- if (!vma || address < vma->vm_start)
- return -EFAULT;
-
- ret = handle_mm_fault(mm, vma, address, fault_flags);
- if (ret & VM_FAULT_ERROR) {
- if (ret & VM_FAULT_OOM)
- return -ENOMEM;
- if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
- return -EHWPOISON;
- if (ret & VM_FAULT_SIGBUS)
- return -EFAULT;
- BUG();
- }
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
- return 0;
-}
-
-/*
- * get_user_pages() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to by the caller
- * @force: whether to force write access even if user mapping is
- * readonly. This will result in the page being COWed even
- * in MAP_SHARED mappings. You do not want this.
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with. vmas will only
- * remain valid while mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If write=0, the page must not be written to. If the page is written to,
- * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
- * after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- */
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
-{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
-}
-EXPORT_SYMBOL(get_user_pages);
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by page_cache_release() or put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_sem, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
-
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
spinlock_t **ptl)
{
@@ -2559,6 +1961,8 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
+ debug_dma_assert_idle(src);
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
@@ -2584,6 +1988,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
}
/*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ struct vm_fault vmf;
+ int ret;
+
+ vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+ vmf.pgoff = page->index;
+ vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ vmf.page = page;
+
+ ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+ return ret;
+ if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ return 0; /* retry */
+ }
+ ret |= VM_FAULT_LOCKED;
+ } else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ return ret;
+}
+
+/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
@@ -2665,42 +2101,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- struct vm_fault vmf;
int tmp;
-
- vmf.virtual_address = (void __user *)(address &
- PAGE_MASK);
- vmf.pgoff = old_page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.page = old_page;
-
- /*
- * Notify the address space that the page is about to
- * become writable so that it can prohibit this or wait
- * for the page to get into an appropriate state.
- *
- * We do this without the lock held, so that it can
- * sleep if it needs to.
- */
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
-
- tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
- if (unlikely(tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- ret = tmp;
- goto unwritable_page;
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
}
- if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
- lock_page(old_page);
- if (!old_page->mapping) {
- ret = 0; /* retry the fault */
- unlock_page(old_page);
- goto unwritable_page;
- }
- } else
- VM_BUG_ON(!PageLocked(old_page));
-
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
@@ -2745,11 +2154,11 @@ reuse:
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
- * __do_fault is protected similarly.
+ * do_shared_fault is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ set_page_dirty_balance(dirty_page);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
@@ -2795,7 +2204,7 @@ gotten:
}
__SetPageUptodate(new_page);
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
@@ -2889,10 +2298,6 @@ oom:
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;
-
-unwritable_page:
- page_cache_release(old_page);
- return ret;
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -3252,7 +2657,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -3283,53 +2688,11 @@ oom:
return VM_FAULT_OOM;
}
-/*
- * __do_fault() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
- * the next page fault.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte neither mapped nor locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd,
- pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int __do_fault(struct vm_area_struct *vma, unsigned long address,
+ pgoff_t pgoff, unsigned int flags, struct page **page)
{
- pte_t *page_table;
- spinlock_t *ptl;
- struct page *page;
- struct page *cow_page;
- pte_t entry;
- int anon = 0;
- struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
- int page_mkwrite = 0;
-
- /*
- * If we do COW later, allocate page befor taking lock_page()
- * on the file cache page. This will reduce lock holding time.
- */
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
-
- cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (!cow_page)
- return VM_FAULT_OOM;
-
- if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
- page_cache_release(cow_page);
- return VM_FAULT_OOM;
- }
- } else
- cow_page = NULL;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
@@ -3337,150 +2700,319 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vmf.page = NULL;
ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
- VM_FAULT_RETRY)))
- goto uncharge_out;
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- ret = VM_FAULT_HWPOISON;
- goto uncharge_out;
+ page_cache_release(vmf.page);
+ return VM_FAULT_HWPOISON;
}
- /*
- * For consistency in subsequent calls, make the faulted page always
- * locked.
- */
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
else
- VM_BUG_ON(!PageLocked(vmf.page));
+ VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+
+ *page = vmf.page;
+ return ret;
+}
+
+/**
+ * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true, if new entry is writable
+ * @anon: true, if it's anonymous page
+ *
+ * Caller must hold page table lock relevant for @pte.
+ *
+ * Target users are page handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+ struct page *page, pte_t *pte, bool write, bool anon)
+{
+ pte_t entry;
+
+ flush_icache_page(vma, page);
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (write)
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
+ pte_mksoft_dirty(entry);
+ if (anon) {
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, address);
+ } else {
+ inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+ page_add_file_rmap(page);
+ }
+ set_pte_at(vma->vm_mm, address, pte, entry);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, pte);
+}
+
+static unsigned long fault_around_bytes = rounddown_pow_of_two(65536);
+
+static inline unsigned long fault_around_pages(void)
+{
+ return fault_around_bytes >> PAGE_SHIFT;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+ return ~(fault_around_bytes - 1) & PAGE_MASK;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int fault_around_bytes_get(void *data, u64 *val)
+{
+ *val = fault_around_bytes;
+ return 0;
+}
+
+/*
+ * fault_around_pages() and fault_around_mask() expects fault_around_bytes
+ * rounded down to nearest page order. It's what do_fault_around() expects to
+ * see.
+ */
+static int fault_around_bytes_set(void *data, u64 val)
+{
+ if (val / PAGE_SIZE > PTRS_PER_PTE)
+ return -EINVAL;
+ if (val > PAGE_SIZE)
+ fault_around_bytes = rounddown_pow_of_two(val);
+ else
+ fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
+ fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
+ if (!ret)
+ pr_warn("Failed to create fault_around_bytes in debugfs");
+ return 0;
+}
+late_initcall(fault_around_debugfs);
+#endif
+
+/*
+ * do_fault_around() tries to map few pages around the fault address. The hope
+ * is that the pages will be needed soon and this will lower the number of
+ * faults to handle.
+ *
+ * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
+ * not ready to be mapped: not up-to-date, locked, etc.
+ *
+ * This function is called with the page table lock taken. In the split ptlock
+ * case the page table lock only protects only those entries which belong to
+ * the page table corresponding to the fault address.
+ *
+ * This function doesn't cross the VMA boundaries, in order to call map_pages()
+ * only once.
+ *
+ * fault_around_pages() defines how many pages we'll try to map.
+ * do_fault_around() expects it to return a power of two less than or equal to
+ * PTRS_PER_PTE.
+ *
+ * The virtual address of the area that we map is naturally aligned to the
+ * fault_around_pages() value (and therefore to page order). This way it's
+ * easier to guarantee that we don't cross page table boundaries.
+ */
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+ unsigned long start_addr;
+ pgoff_t max_pgoff;
+ struct vm_fault vmf;
+ int off;
+
+ start_addr = max(address & fault_around_mask(), vma->vm_start);
+ off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ pte -= off;
+ pgoff -= off;
/*
- * Should we do an early C-O-W break?
+ * max_pgoff is either end of page table or end of vma
+ * or fault_around_pages() from pgoff, depending what is nearest.
*/
- page = vmf.page;
- if (flags & FAULT_FLAG_WRITE) {
- if (!(vma->vm_flags & VM_SHARED)) {
- page = cow_page;
- anon = 1;
- copy_user_highpage(page, vmf.page, address, vma);
- __SetPageUptodate(page);
- } else {
- /*
- * If the page will be shareable, see if the backing
- * address space wants to know that the page is about
- * to become writable
- */
- if (vma->vm_ops->page_mkwrite) {
- int tmp;
-
- unlock_page(page);
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
- if (unlikely(tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
- ret = tmp;
- goto unwritable_page;
- }
- if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
- lock_page(page);
- if (!page->mapping) {
- ret = 0; /* retry the fault */
- unlock_page(page);
- goto unwritable_page;
- }
- } else
- VM_BUG_ON(!PageLocked(page));
- page_mkwrite = 1;
- }
- }
-
+ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ PTRS_PER_PTE - 1;
+ max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+ pgoff + fault_around_pages() - 1);
+
+ /* Check if it makes any sense to call ->map_pages */
+ while (!pte_none(*pte)) {
+ if (++pgoff > max_pgoff)
+ return;
+ start_addr += PAGE_SIZE;
+ if (start_addr >= vma->vm_end)
+ return;
+ pte++;
}
- page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ vmf.virtual_address = (void __user *) start_addr;
+ vmf.pte = pte;
+ vmf.pgoff = pgoff;
+ vmf.max_pgoff = max_pgoff;
+ vmf.flags = flags;
+ vma->vm_ops->map_pages(vma, &vmf);
+}
+
+static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ struct page *fault_page;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret = 0;
/*
- * This silly early PAGE_DIRTY setting removes a race
- * due to the bad i386 page protection. But it's valid
- * for other architectures too.
- *
- * Note that if FAULT_FLAG_WRITE is set, we either now have
- * an exclusive copy of the page, or this is a shared mapping,
- * so we can make it writable and dirty to avoid having to
- * handle that later.
+ * Let's call ->map_pages() first and use ->fault() as fallback
+ * if page by the offset is not ready to be mapped (cold cache or
+ * something).
*/
- /* Only go through if we didn't race with anybody else... */
- if (likely(pte_same(*page_table, orig_pte))) {
- flush_icache_page(vma, page);
- entry = mk_pte(page, vma->vm_page_prot);
- if (flags & FAULT_FLAG_WRITE)
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
- pte_mksoft_dirty(entry);
- if (anon) {
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- } else {
- inc_mm_counter_fast(mm, MM_FILEPAGES);
- page_add_file_rmap(page);
- if (flags & FAULT_FLAG_WRITE) {
- dirty_page = page;
- get_page(dirty_page);
- }
- }
- set_pte_at(mm, address, page_table, entry);
+ if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
+ fault_around_pages() > 1) {
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ do_fault_around(vma, address, pte, pgoff, flags);
+ if (!pte_same(*pte, orig_pte))
+ goto unlock_out;
+ pte_unmap_unlock(pte, ptl);
+ }
- /* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, address, page_table);
- } else {
- if (cow_page)
- mem_cgroup_uncharge_page(cow_page);
- if (anon)
- page_cache_release(page);
- else
- anon = 1; /* no anon but release faulted_page */
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ return ret;
}
+ do_set_pte(vma, address, fault_page, pte, false, false);
+ unlock_page(fault_page);
+unlock_out:
+ pte_unmap_unlock(pte, ptl);
+ return ret;
+}
- pte_unmap_unlock(page_table, ptl);
+static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ struct page *fault_page, *new_page;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int ret;
- if (dirty_page) {
- struct address_space *mapping = page->mapping;
- int dirtied = 0;
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
- if (set_page_dirty(dirty_page))
- dirtied = 1;
- unlock_page(dirty_page);
- put_page(dirty_page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!new_page)
+ return VM_FAULT_OOM;
- /* file_update_time outside page_lock */
- if (vma->vm_file && !page_mkwrite)
- file_update_time(vma->vm_file);
- } else {
- unlock_page(vmf.page);
- if (anon)
- page_cache_release(vmf.page);
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
+ page_cache_release(new_page);
+ return VM_FAULT_OOM;
}
- return ret;
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ goto uncharge_out;
-unwritable_page:
- page_cache_release(page);
+ copy_user_highpage(new_page, fault_page, address, vma);
+ __SetPageUptodate(new_page);
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ goto uncharge_out;
+ }
+ do_set_pte(vma, address, new_page, pte, true, true);
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
return ret;
uncharge_out:
- /* fs's fault handler get error */
- if (cow_page) {
- mem_cgroup_uncharge_page(cow_page);
- page_cache_release(cow_page);
+ mem_cgroup_uncharge_page(new_page);
+ page_cache_release(new_page);
+ return ret;
+}
+
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+ struct page *fault_page;
+ struct address_space *mapping;
+ spinlock_t *ptl;
+ pte_t *pte;
+ int dirtied = 0;
+ int ret, tmp;
+
+ ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+ return ret;
+
+ /*
+ * Check if the backing address space wants to know that the page is
+ * about to become writable
+ */
+ if (vma->vm_ops->page_mkwrite) {
+ unlock_page(fault_page);
+ tmp = do_page_mkwrite(vma, fault_page, address);
+ if (unlikely(!tmp ||
+ (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(fault_page);
+ return tmp;
+ }
+ }
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ if (unlikely(!pte_same(*pte, orig_pte))) {
+ pte_unmap_unlock(pte, ptl);
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ return ret;
+ }
+ do_set_pte(vma, address, fault_page, pte, true, false);
+ pte_unmap_unlock(pte, ptl);
+
+ if (set_page_dirty(fault_page))
+ dirtied = 1;
+ mapping = fault_page->mapping;
+ unlock_page(fault_page);
+ if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping but still
+ * dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+ file_update_time(vma->vm_file);
+
return ret;
}
@@ -3492,7 +3024,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ if (!(flags & FAULT_FLAG_WRITE))
+ return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ if (!(vma->vm_flags & VM_SHARED))
+ return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
@@ -3524,10 +3062,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
pgoff = pte_to_pgoff(orig_pte);
- return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+ if (!(flags & FAULT_FLAG_WRITE))
+ return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ if (!(vma->vm_flags & VM_SHARED))
+ return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+ orig_pte);
+ return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
{
@@ -3542,7 +3086,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
struct page *page = NULL;
@@ -3700,7 +3244,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
-retry:
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
@@ -3738,26 +3281,16 @@ retry:
if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
- /*
- * If COW results in an oom, the huge pmd will
- * have been split, so retry the fault on the
- * pte for a smaller charge.
- */
- if (unlikely(ret & VM_FAULT_OOM))
- goto retry;
- return ret;
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
+ return 0;
}
-
- return 0;
}
}
- /* THP should already have been handled */
- BUG_ON(pmd_numa(*pmd));
-
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
@@ -4271,12 +3804,21 @@ void copy_user_huge_page(struct page *dst, struct page *src,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+
+static struct kmem_cache *page_ptl_cachep;
+
+void __init ptlock_cache_init(void)
+{
+ page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+ SLAB_PANIC, NULL);
+}
+
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
- ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+ ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
if (!ptl)
return false;
page->ptl = ptl;
@@ -4285,6 +3827,6 @@ bool ptlock_alloc(struct page *page)
void ptlock_free(struct page *page)
{
- kfree(page->ptl);
+ kmem_cache_free(page_ptl_cachep, page->ptl);
}
#endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235502d..469bbf505f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
@@ -47,19 +46,84 @@
static void generic_online_page(struct page *page);
static online_page_callback_t online_page_callback = generic_online_page;
+static DEFINE_MUTEX(online_page_callback_lock);
-DEFINE_MUTEX(mem_hotplug_mutex);
+/* The same as the cpu_hotplug lock, but for memory hotplug. */
+static struct {
+ struct task_struct *active_writer;
+ struct mutex lock; /* Synchronizes accesses to refcount, */
+ /*
+ * Also blocks the new readers during
+ * an ongoing mem hotplug operation.
+ */
+ int refcount;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+} mem_hotplug = {
+ .active_writer = NULL,
+ .lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
+ .refcount = 0,
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ .dep_map = {.name = "mem_hotplug.lock" },
+#endif
+};
+
+/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
+#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
+#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
+#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+
+void get_online_mems(void)
+{
+ might_sleep();
+ if (mem_hotplug.active_writer == current)
+ return;
+ memhp_lock_acquire_read();
+ mutex_lock(&mem_hotplug.lock);
+ mem_hotplug.refcount++;
+ mutex_unlock(&mem_hotplug.lock);
+
+}
-void lock_memory_hotplug(void)
+void put_online_mems(void)
{
- mutex_lock(&mem_hotplug_mutex);
+ if (mem_hotplug.active_writer == current)
+ return;
+ mutex_lock(&mem_hotplug.lock);
+
+ if (WARN_ON(!mem_hotplug.refcount))
+ mem_hotplug.refcount++; /* try to fix things up */
+
+ if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
+ wake_up_process(mem_hotplug.active_writer);
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+
}
-void unlock_memory_hotplug(void)
+static void mem_hotplug_begin(void)
{
- mutex_unlock(&mem_hotplug_mutex);
+ mem_hotplug.active_writer = current;
+
+ memhp_lock_acquire();
+ for (;;) {
+ mutex_lock(&mem_hotplug.lock);
+ if (likely(!mem_hotplug.refcount))
+ break;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&mem_hotplug.lock);
+ schedule();
+ }
}
+static void mem_hotplug_done(void)
+{
+ mem_hotplug.active_writer = NULL;
+ mutex_unlock(&mem_hotplug.lock);
+ memhp_lock_release();
+}
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
@@ -269,7 +333,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
}
/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic() */
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
unsigned long start_pfn, unsigned long num_pages)
{
@@ -728,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- lock_memory_hotplug();
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
if (online_page_callback == generic_online_page) {
online_page_callback = callback;
rc = 0;
}
- unlock_memory_hotplug();
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
return rc;
}
@@ -745,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback)
{
int rc = -EINVAL;
- lock_memory_hotplug();
+ get_online_mems();
+ mutex_lock(&online_page_callback_lock);
if (online_page_callback == callback) {
online_page_callback = generic_online_page;
rc = 0;
}
- unlock_memory_hotplug();
+ mutex_unlock(&online_page_callback_lock);
+ put_online_mems();
return rc;
}
@@ -900,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- lock_memory_hotplug();
+ mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -908,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
+ ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
- !can_online_high_movable(zone)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ !can_online_high_movable(zone))
+ goto out;
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
- if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
+ goto out;
}
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
- if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
- unlock_memory_hotplug();
- return -EINVAL;
- }
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
+ goto out;
}
/* Previous code may changed the zone of the pfn range */
@@ -940,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- unlock_memory_hotplug();
- return ret;
+ goto out;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -965,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- unlock_memory_hotplug();
- return ret;
+ goto out;
}
zone->present_pages += onlined_pages;
@@ -996,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
- unlock_memory_hotplug();
-
- return 0;
+out:
+ mem_hotplug_done();
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1008,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
struct pglist_data *pgdat;
unsigned long zones_size[MAX_NR_ZONES] = {0};
unsigned long zholes_size[MAX_NR_ZONES] = {0};
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
if (!pgdat) {
@@ -1056,7 +1117,7 @@ int try_online_node(int nid)
if (node_online(nid))
return 0;
- lock_memory_hotplug();
+ mem_hotplug_begin();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
@@ -1074,13 +1135,13 @@ int try_online_node(int nid)
}
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
static int check_hotplug_memory_range(u64 start, u64 size)
{
- u64 start_pfn = start >> PAGE_SHIFT;
+ u64 start_pfn = PFN_DOWN(start);
u64 nr_pages = size >> PAGE_SHIFT;
/* Memory range must be aligned with section */
@@ -1108,17 +1169,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (ret)
return ret;
- lock_memory_hotplug();
-
res = register_memory_resource(start, size);
ret = -EEXIST;
if (!res)
- goto out;
+ return ret;
{ /* Stupid hack to suppress address-never-null warning */
void *p = NODE_DATA(nid);
new_pgdat = !p;
}
+
+ mem_hotplug_begin();
+
new_node = !node_online(nid);
if (new_node) {
pgdat = hotadd_new_pgdat(nid, start);
@@ -1158,7 +1220,7 @@ error:
release_memory_resource(res);
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
@@ -1310,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_DEBUG_VM
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
pfn);
- dump_page(page);
+ dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
/* Because we don't have big zone->lock. we should
@@ -1332,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* alloc_migrate_target should be improooooved!!
* migrate_pages returns # of failed pages.
*/
- ret = migrate_pages(&source, alloc_migrate_target, 0,
+ ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret)
putback_movable_pages(&source);
@@ -1446,6 +1508,7 @@ static int __init cmdline_parse_movable_node(char *p)
* the kernel away from hotpluggable memory.
*/
memblock_set_bottom_up(true);
+ movable_node_enabled = true;
#else
pr_warn("movable_node option not supported\n");
#endif
@@ -1564,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- lock_memory_hotplug();
+ mem_hotplug_begin();
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
@@ -1671,7 +1734,7 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- unlock_memory_hotplug();
+ mem_hotplug_done();
return 0;
failed_removal:
@@ -1683,7 +1746,7 @@ failed_removal:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
out:
- unlock_memory_hotplug();
+ mem_hotplug_done();
return ret;
}
@@ -1887,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
- lock_memory_hotplug();
+ mem_hotplug_begin();
/*
* All memory blocks must be offlined before removing memory. Check
@@ -1896,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
*/
ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
check_memblock_offlined_cb);
- if (ret) {
- unlock_memory_hotplug();
+ if (ret)
BUG();
- }
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1908,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
try_offline_node(nid);
- unlock_memory_hotplug();
+ mem_hotplug_done();
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a312912..8f5330d74f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -65,6 +65,8 @@
kernel is not always grateful with that.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
@@ -91,6 +93,7 @@
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
+#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -526,9 +529,13 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
int nid;
struct page *page;
spinlock_t *ptl;
+ pte_t entry;
ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- page = pte_page(huge_ptep_get((pte_t *)pmd));
+ entry = huge_ptep_get((pte_t *)pmd);
+ if (!pte_present(entry))
+ goto unlock;
+ page = pte_page(entry);
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
goto unlock;
@@ -613,7 +620,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
return 0;
}
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +634,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
int nr_updated;
- BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
@@ -641,7 +647,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
{
return 0;
}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Walk through page tables and collect pages to be migrated.
@@ -650,19 +656,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
* @nodes and @flags,) it's isolated and queued to the pagelist which is
* passed via @private.)
*/
-static struct vm_area_struct *
+static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
const nodemask_t *nodes, unsigned long flags, void *private)
{
- int err;
- struct vm_area_struct *first, *vma, *prev;
-
+ int err = 0;
+ struct vm_area_struct *vma, *prev;
- first = find_vma(mm, start);
- if (!first)
- return ERR_PTR(-EFAULT);
+ vma = find_vma(mm, start);
+ if (!vma)
+ return -EFAULT;
prev = NULL;
- for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ for (; vma && vma->vm_start < end; vma = vma->vm_next) {
unsigned long endvma = vma->vm_end;
if (endvma > end)
@@ -672,9 +677,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
if (prev && prev->vm_end < vma->vm_start)
- return ERR_PTR(-EFAULT);
+ return -EFAULT;
}
if (flags & MPOL_MF_LAZY) {
@@ -688,15 +693,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
err = queue_pages_pgd_range(vma, start, endvma, nodes,
flags, private);
- if (err) {
- first = ERR_PTR(err);
+ if (err)
break;
- }
}
next:
prev = vma;
}
- return first;
+ return err;
}
/*
@@ -796,36 +799,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
-}
-
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
@@ -862,7 +835,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
- mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
@@ -1060,7 +1032,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest,
+ err = migrate_pages(&pagelist, new_node_page, NULL, dest,
MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1181,30 +1153,31 @@ out:
/*
* Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the same vma as contains @start.
* Search forward from there, if not. N.B., this assumes that the
* list of pages handed to migrate_pages()--which is how we get here--
* is in virtual address order.
*/
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
- struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ struct vm_area_struct *vma;
unsigned long uninitialized_var(address);
+ vma = find_vma(current->mm, start);
while (vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
vma = vma->vm_next;
}
- /*
- * queue_pages_range() confirms that @page belongs to some vma,
- * so vma shouldn't be NULL.
- */
- BUG_ON(!vma);
- if (PageHuge(page))
+ if (PageHuge(page)) {
+ BUG_ON(!vma);
return alloc_huge_page_noerr(vma, address, 1);
+ }
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else
@@ -1220,7 +1193,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
return -ENOSYS;
}
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
{
return NULL;
}
@@ -1230,7 +1203,6 @@ static long do_mbind(unsigned long start, unsigned long len,
unsigned short mode, unsigned short mode_flags,
nodemask_t *nmask, unsigned long flags)
{
- struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
struct mempolicy *new;
unsigned long end;
@@ -1296,11 +1268,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (err)
goto mpol_out;
- vma = queue_pages_range(mm, start, end, nmask,
+ err = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
-
- err = PTR_ERR(vma); /* maybe ... */
- if (!IS_ERR(vma))
+ if (!err)
err = mbind_range(mm, start, end, new);
if (!err) {
@@ -1308,9 +1278,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
- nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma,
- MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+ nr_failed = migrate_pages(&pagelist, new_page, NULL,
+ start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_movable_pages(&pagelist);
}
@@ -1318,7 +1287,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
- putback_lru_pages(&pagelist);
+ putback_movable_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_out:
@@ -1394,7 +1363,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
- unsigned long, mode, unsigned long __user *, nmask,
+ unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned, flags)
{
nodemask_t nodes;
@@ -1415,7 +1384,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
}
/* Set the process memory policy */
-SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
+SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
int err;
@@ -1557,10 +1526,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
#ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_get_mempolicy(int __user *policy,
- compat_ulong_t __user *nmask,
- compat_ulong_t maxnode,
- compat_ulong_t addr, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode,
+ compat_ulong_t, addr, compat_ulong_t, flags)
{
long err;
unsigned long __user *nm = NULL;
@@ -1587,8 +1556,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
return err;
}
-asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode)
+COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1610,9 +1579,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
return sys_set_mempolicy(mode, nm, nr_bits+1);
}
-asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
- compat_ulong_t mode, compat_ulong_t __user *nmask,
- compat_ulong_t maxnode, compat_ulong_t flags)
+COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
+ compat_ulong_t, mode, compat_ulong_t __user *, nmask,
+ compat_ulong_t, maxnode, compat_ulong_t, flags)
{
long err = 0;
unsigned long __user *nm = NULL;
@@ -1638,9 +1607,9 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
/*
* get_vma_policy(@task, @vma, @addr)
- * @task - task for fallback if vma policy == default
- * @vma - virtual memory area whose policy is sought
- * @addr - address in @vma for shared policy lookup
+ * @task: task for fallback if vma policy == default
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
@@ -1783,21 +1752,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
- * @policy must be protected by freeing by the caller. If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
- * such protection.
*/
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
+ int node = numa_mem_id();
if (in_interrupt())
- return numa_node_id();
+ return node;
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
- return numa_node_id();
+ return node;
switch (policy->mode) {
case MPOL_PREFERRED:
@@ -1817,11 +1783,11 @@ unsigned slab_node(void)
struct zonelist *zonelist;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+ zonelist = &NODE_DATA(node)->node_zonelists[0];
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone ? zone->node : numa_node_id();
+ return zone ? zone->node : node;
}
default:
@@ -1889,18 +1855,18 @@ int node_random(const nodemask_t *maskp)
#ifdef CONFIG_HUGETLBFS
/*
* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
- * @vma = virtual memory area whose policy is sought
- * @addr = address in @vma for shared policy lookup and interleave policy
- * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted mempolicy
- * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags: for requested zone
+ * @mpol: pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
*
* Returns a zonelist suitable for a huge page allocation and a pointer
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
*
- * Must be protected by get_mems_allowed()
+ * Must be protected by read_mems_allowed_begin()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
@@ -2064,7 +2030,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
@@ -2072,7 +2038,7 @@ retry_cpuset:
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2082,7 +2048,7 @@ retry_cpuset:
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
}
@@ -2116,7 +2082,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
pol = &default_policy;
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/*
* No reference counting needed for current->mempolicy
@@ -2129,7 +2095,7 @@ retry_cpuset:
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2173,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
} else
*new = *old;
- rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
@@ -2181,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
@@ -2302,41 +2266,12 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- /* Never defer a private fault */
- if (cpupid_match_pid(p, last_cpupid))
- return false;
-
- if (p->numa_migrate_deferred) {
- p->numa_migrate_deferred--;
- return true;
- }
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
- p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
-}
-#else
-static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
/**
* mpol_misplaced - check whether current page node is valid in policy
*
- * @page - page to be checked
- * @vma - vm area where page mapped
- * @addr - virtual address where page mapped
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id.
@@ -2404,52 +2339,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_cpupid;
- int this_cpupid;
-
polnid = thisnid;
- this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
- /*
- * Multi-stage node selection is used in conjunction
- * with a periodic migration fault to build a temporal
- * task<->page relation. By using a two-stage filter we
- * remove short/unlikely relations.
- *
- * Using P(p) ~ n_p / n_t as per frequentist
- * probability, we can equate a task's usage of a
- * particular page (n_p) per total usage of this
- * page (n_t) (in a given time-span) to a probability.
- *
- * Our periodic faults will sample this probability and
- * getting the same result twice in a row, given these
- * samples are fully independent, is then given by
- * P(n)^2, provided our sample period is sufficiently
- * short compared to the usage pattern.
- *
- * This quadric squishes small probabilities, making
- * it less likely we act on an unlikely task<->page
- * relation.
- */
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
- if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
-
- /* See sysctl_numa_balancing_migrate_deferred comment */
- if (!cpupid_match_pid(current, last_cpupid))
- defer_numa_migrate(current);
-
- goto out;
- }
-
- /*
- * The quadratic filter above reduces extraneous migration
- * of shared pages somewhat. This code reduces it even more,
- * reducing the overhead of page migrations of shared pages.
- * This makes workloads with shared pages rely more on
- * "move task near its memory", and less on "move memory
- * towards its task", which is exactly what we want.
- */
- if (numa_migrate_deferred(current, last_cpupid))
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
@@ -2655,7 +2547,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
}
#ifdef CONFIG_NUMA_BALANCING
-static bool __initdata numabalancing_override;
+static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
@@ -2664,9 +2556,15 @@ static void __init check_numabalancing_enable(void)
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
+ /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+ if (numabalancing_override)
+ set_numabalancing_state(numabalancing_override == 1);
+
if (nr_node_ids > 1 && !numabalancing_override) {
- printk(KERN_INFO "Enabling automatic NUMA balancing. "
- "Configure with numa_balancing= or sysctl");
+ pr_info("%s automatic NUMA balancing. "
+ "Configure with numa_balancing= or the "
+ "kernel.numa_balancing sysctl",
+ numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
@@ -2676,18 +2574,17 @@ static int __init setup_numabalancing(char *str)
int ret = 0;
if (!str)
goto out;
- numabalancing_override = true;
if (!strcmp(str, "enable")) {
- set_numabalancing_state(true);
+ numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
- set_numabalancing_state(false);
+ numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
- printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+ pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
@@ -2747,7 +2644,7 @@ void __init numa_policy_init(void)
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
- printk("numa_policy_init: interleaving failed\n");
+ pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
@@ -2926,7 +2823,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
- if (pol && pol != &default_policy) {
+ if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 659aa42bad1..e209c98c720 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
@@ -192,6 +193,7 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
*/
void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
@@ -200,6 +202,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
wait_queue_t wait;
gfp_t gfp_temp;
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_sleep_if(gfp_mask & __GFP_WAIT);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
@@ -220,6 +223,11 @@ repeat_alloc:
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
+ /*
+ * Update the allocation stack trace as this is more useful
+ * for debugging.
+ */
+ kmemleak_update_trace(element);
return element;
}
@@ -304,9 +312,9 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (pool->curr_nr < pool->min_nr) {
+ if (unlikely(pool->curr_nr < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
- if (pool->curr_nr < pool->min_nr) {
+ if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe8..be6dbf995c0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
@@ -71,28 +72,12 @@ int migrate_prep_local(void)
}
/*
- * Add isolated pages on the list back to the LRU under page lock
- * to avoid leaking evictable pages back onto unevictable list.
- */
-void putback_lru_pages(struct list_head *l)
-{
- struct page *page;
- struct page *page2;
-
- list_for_each_entry_safe(page, page2, l, lru) {
- list_del(&page->lru);
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- putback_lru_page(page);
- }
-}
-
-/*
* Put previously isolated pages back onto the appropriate lists
* from where they were once taken off for compaction/migration.
*
- * This function shall be used instead of putback_lru_pages(),
- * whenever the isolated pageset has been built by isolate_migratepages_range()
+ * This function shall be used whenever the isolated pageset has been
+ * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
+ * and isolate_huge_page().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -135,8 +120,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
- if (pmd_trans_huge(*pmd))
- goto out;
ptep = pte_offset_map(pmd, addr);
@@ -193,12 +176,49 @@ out:
}
/*
+ * Congratulations to trinity for discovering this bug.
+ * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
+ * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
+ * replace the specified range by file ptes throughout (maybe populated after).
+ * If page migration finds a page within that range, while it's still located
+ * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
+ * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
+ * But if the migrating page is in a part of the vma outside the range to be
+ * remapped, then it will not be cleared, and remove_migration_ptes() needs to
+ * deal with it. Fortunately, this part of the vma is of course still linear,
+ * so we just need to use linear location on the nonlinear list.
+ */
+static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
+ struct address_space *mapping, void *arg)
+{
+ struct vm_area_struct *vma;
+ /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ unsigned long addr;
+
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
+ addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ if (addr >= vma->vm_start && addr < vma->vm_end)
+ remove_migration_pte(page, vma, addr, arg);
+ }
+ return SWAP_AGAIN;
+}
+
+/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
static void remove_migration_ptes(struct page *old, struct page *new)
{
- rmap_walk(new, remove_migration_pte, old);
+ struct rmap_walk_control rwc = {
+ .rmap_one = remove_migration_pte,
+ .arg = old,
+ .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
+ };
+
+ rmap_walk(new, &rwc);
}
/*
@@ -316,14 +336,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
*/
int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode)
+ struct buffer_head *head, enum migrate_mode mode,
+ int extra_count)
{
- int expected_count = 0;
+ int expected_count = 1 + extra_count;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
- if (page_count(page) != 1)
+ if (page_count(page) != expected_count)
return -EAGAIN;
return MIGRATEPAGE_SUCCESS;
}
@@ -333,7 +354,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(page));
- expected_count = 2 + page_has_private(page);
+ expected_count += 1 + page_has_private(page);
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
@@ -508,7 +529,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
if (PageUptodate(page))
SetPageUptodate(newpage);
if (TestClearPageActive(page)) {
- VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
@@ -561,14 +582,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Migration functions
***********************************************************/
-/* Always fail migration. Used for mappings that are not movable */
-int fail_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page)
-{
- return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
/*
* Common logic to directly migrate a single page suitable for
* pages that do not use PagePrivate/PagePrivate2.
@@ -583,7 +596,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -610,7 +623,7 @@ int buffer_migrate_page(struct address_space *mapping,
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -888,7 +901,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- VM_BUG_ON(PageAnon(page));
+ VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
try_to_free_buffers(page);
goto uncharge;
@@ -923,8 +936,9 @@ out:
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, enum migrate_mode mode)
+static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
+ unsigned long private, struct page *page, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -968,11 +982,18 @@ out:
page_is_file_cache(page));
putback_lru_page(page);
}
+
/*
- * Move the new page to the LRU. If migration was not successful
- * then this will free the page.
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, putback_lru_page() will drop the reference grabbed
+ * during isolation.
*/
- putback_lru_page(newpage);
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
+ ClearPageSwapBacked(newpage);
+ put_new_page(newpage, private);
+ } else
+ putback_lru_page(newpage);
+
if (result) {
if (rc)
*result = rc;
@@ -1001,12 +1022,13 @@ out:
* will wait in the page fault for migration to complete.
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
- unsigned long private, struct page *hpage,
- int force, enum migrate_mode mode)
+ free_page_t put_new_page, unsigned long private,
+ struct page *hpage, int force,
+ enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
- struct page *new_hpage = get_new_page(hpage, private, &result);
+ struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
/*
@@ -1016,9 +1038,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* tables or check whether the hugepage is pmd-based or not before
* kicking migration.
*/
- if (!hugepage_migration_support(page_hstate(hpage)))
+ if (!hugepage_migration_supported(page_hstate(hpage))) {
+ putback_active_hugepage(hpage);
return -ENOSYS;
+ }
+ new_hpage = get_new_page(hpage, private, &result);
if (!new_hpage)
return -ENOMEM;
@@ -1038,20 +1063,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (!page_mapped(hpage))
rc = move_to_new_page(new_hpage, hpage, 1, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
remove_migration_ptes(hpage, hpage);
if (anon_vma)
put_anon_vma(anon_vma);
- if (!rc)
+ if (rc == MIGRATEPAGE_SUCCESS)
hugetlb_cgroup_migrate(hpage, new_hpage);
unlock_page(hpage);
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- put_page(new_hpage);
+
+ /*
+ * If migration was not successful and there's a freeing callback, use
+ * it. Otherwise, put_page() will drop the reference grabbed during
+ * isolation.
+ */
+ if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+ put_new_page(new_hpage, private);
+ else
+ put_page(new_hpage);
+
if (result) {
if (rc)
*result = rc;
@@ -1068,6 +1103,8 @@ out:
* @from: The list of pages to be migrated.
* @get_new_page: The function used to allocate free pages to be used
* as the target of the page migration.
+ * @put_new_page: The function used to free target pages if migration
+ * fails, or NULL if no special handling is necessary.
* @private: Private data to be passed on to get_new_page()
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
@@ -1081,7 +1118,8 @@ out:
* Returns the number of pages that were not migrated, or an error code.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
- unsigned long private, enum migrate_mode mode, int reason)
+ free_page_t put_new_page, unsigned long private,
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
@@ -1103,10 +1141,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, mode);
+ put_new_page, private, page,
+ pass > 2, mode);
else
- rc = unmap_and_move(get_new_page, private,
- page, pass > 2, mode);
+ rc = unmap_and_move(get_new_page, put_new_page,
+ private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
@@ -1118,7 +1157,12 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
nr_succeeded++;
break;
default:
- /* Permanent failure */
+ /*
+ * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * unlike -EAGAIN case, the failed page is
+ * removed from migration page list and not
+ * retried in the next outer loop.
+ */
nr_failed++;
break;
}
@@ -1167,7 +1211,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
pm->node);
else
return alloc_pages_exact_node(pm->node,
- GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
+ GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
/*
@@ -1250,7 +1294,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_page_node,
+ err = migrate_pages(&pagelist, new_page_node, NULL,
(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
@@ -1553,12 +1597,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
struct page *newpage;
newpage = alloc_pages_exact_node(nid,
- (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
- __GFP_NOMEMALLOC | __GFP_NORETRY |
- __GFP_NOWARN) &
+ (GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE | __GFP_NOMEMALLOC |
+ __GFP_NORETRY | __GFP_NOWARN) &
~GFP_IOFS, 0);
- if (newpage)
- page_cpupid_xchg_last(newpage, page_cpupid_last(page));
return newpage;
}
@@ -1592,35 +1634,42 @@ bool migrate_ratelimited(int node)
}
/* Returns true if the node is migrate rate-limited after the update */
-bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
+ unsigned long nr_pages)
{
- bool rate_limited = false;
-
/*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
*/
- spin_lock(&pgdat->numabalancing_migrate_lock);
if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ spin_lock(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies +
msecs_to_jiffies(migrate_interval_millisecs);
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
}
- if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
- rate_limited = true;
- else
- pgdat->numabalancing_migrate_nr_pages += nr_pages;
- spin_unlock(&pgdat->numabalancing_migrate_lock);
-
- return rate_limited;
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
+ trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
+ nr_pages);
+ return true;
+ }
+
+ /*
+ * This is an unlocked non-atomic update so errors are possible.
+ * The consequences are failing to migrate when we potentiall should
+ * have which is not severe enough to warrant locking. If it is ever
+ * a problem, it can be converted to a per-cpu counter.
+ */
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ return false;
}
-int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
- VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
+ VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
@@ -1654,6 +1703,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 1;
}
+bool pmd_trans_migrating(pmd_t pmd)
+{
+ struct page *page = pmd_page(pmd);
+ return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+ wait_on_page_locked(page);
+}
+
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -1689,9 +1750,15 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+ NULL, node, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
if (nr_remaining) {
- putback_lru_pages(&migratepages);
+ if (!list_empty(&migratepages)) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ putback_lru_page(page);
+ }
isolated = 0;
} else
count_vm_numa_event(NUMA_PAGE_MIGRATE);
@@ -1716,12 +1783,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct page *page, int node)
{
spinlock_t *ptl;
- unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ pmd_t orig_entry;
/*
* Rate-limit the amount of data that is being migrated to a node.
@@ -1732,18 +1801,20 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+ HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
- page_cpupid_xchg_last(new_page, page_cpupid_last(page));
-
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
put_page(new_page);
goto out_fail;
}
+ if (mm_tlb_flush_pending(mm))
+ flush_tlb_range(vma, mmun_start, mmun_end);
+
/* Prepare a page as a migration target */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
@@ -1755,9 +1826,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry))) {
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1774,7 +1848,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
putback_lru_page(page);
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
- goto out_fail;
+
+ goto out_unlock;
}
/*
@@ -1786,16 +1861,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
*/
mem_cgroup_prepare_migration(page, new_page, &memcg);
+ orig_entry = *pmd;
entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = pmd_mknonnuma(entry);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush(vma, haddr, pmd);
- set_pmd_at(mm, haddr, pmd, entry);
- page_add_new_anon_rmap(new_page, vma, haddr);
+ /*
+ * Clear the old entry under pagetable lock and establish the new PTE.
+ * Any parallel GUP will either observe the old page blocking on the
+ * page lock, block on the page table lock or observe the new page.
+ * The SetPageUptodate on the new page and page_add_new_anon_rmap
+ * guarantee the copy is visible before the pagetable update.
+ */
+ flush_cache_range(vma, mmun_start, mmun_end);
+ page_add_anon_rmap(new_page, vma, mmun_start);
+ pmdp_clear_flush(vma, mmun_start, pmd);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
+
+ if (page_count(page) != 2) {
+ set_pmd_at(mm, mmun_start, pmd, orig_entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(new_page);
+ goto fail_putback;
+ }
+
page_remove_rmap(page);
+
/*
* Finish the charge transaction under the page table lock to
* prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1897,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ /* Take an "isolate" reference and put new page on the LRU. */
+ get_page(new_page);
+ putback_lru_page(new_page);
unlock_page(new_page);
unlock_page(page);
@@ -1820,10 +1919,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
- entry = pmd_mknonnuma(entry);
- set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_same(*pmd, entry)) {
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ }
+ spin_unlock(ptl);
+out_unlock:
unlock_page(page);
put_page(page);
return 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8..725c8096104 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
- page = find_get_page(mapping, pgoff);
#ifdef CONFIG_SWAP
- /* shmem/tmpfs may return swap: account for swapcache page too. */
- if (radix_tree_exceptional_entry(page)) {
- swp_entry_t swap = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swap), swap.val);
- }
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (radix_tree_exceptional_entry(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp), swp.val);
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
#endif
if (page) {
present = PageUptodate(page);
@@ -225,13 +233,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
- if (is_vm_hugetlb_page(vma)) {
- mincore_hugetlb_page_range(vma, addr, end, vec);
- return (end - addr) >> PAGE_SHIFT;
- }
-
- end = pmd_addr_end(addr, end);
-
if (is_vm_hugetlb_page(vma))
mincore_hugetlb_page_range(vma, addr, end, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index d480cd6fc47..b1eb5363400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -91,6 +92,26 @@ void mlock_vma_page(struct page *page)
}
/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+ if (PageLRU(page)) {
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+ if (getpage)
+ get_page(page);
+ ClearPageLRU(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Finish munlock after successful page isolation
*
* Page must be locked. This is a wrapper for try_to_munlock()
@@ -126,14 +147,17 @@ static void __munlock_isolated_page(struct page *page)
static void __munlock_isolation_failed(struct page *page)
{
if (PageUnevictable(page))
- count_vm_event(UNEVICTABLE_PGSTRANDED);
+ __count_vm_event(UNEVICTABLE_PGSTRANDED);
else
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
}
/**
* munlock_vma_page - munlock a vma page
- * @page - page to be unlocked
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
*
* called from munlock()/munmap() path with page supposedly on the LRU.
* When we munlock a page, because the vma where we found the page is being
@@ -148,21 +172,37 @@ static void __munlock_isolation_failed(struct page *page)
*/
unsigned int munlock_vma_page(struct page *page)
{
- unsigned int page_mask = 0;
+ unsigned int nr_pages;
+ struct zone *zone = page_zone(page);
+ /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
- if (TestClearPageMlocked(page)) {
- unsigned int nr_pages = hpage_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- page_mask = nr_pages - 1;
- if (!isolate_lru_page(page))
- __munlock_isolated_page(page);
- else
- __munlock_isolation_failed(page);
+ /*
+ * Serialize with any parallel __split_huge_page_refcount() which
+ * might otherwise copy PageMlocked to part of the tail pages before
+ * we clear it in the head page. It also stabilizes hpage_nr_pages().
+ */
+ spin_lock_irq(&zone->lru_lock);
+
+ nr_pages = hpage_nr_pages(page);
+ if (!TestClearPageMlocked(page))
+ goto unlock_out;
+
+ __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+ if (__munlock_isolate_lru_page(page, true)) {
+ spin_unlock_irq(&zone->lru_lock);
+ __munlock_isolated_page(page);
+ goto out;
}
+ __munlock_isolation_failed(page);
+
+unlock_out:
+ spin_unlock_irq(&zone->lru_lock);
- return page_mask;
+out:
+ return nr_pages - 1;
}
/**
@@ -241,8 +281,8 @@ static int __mlock_posix_error_return(long retval)
static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
int *pgrescued)
{
- VM_BUG_ON(PageLRU(page));
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page_mapcount(page) <= 1 && page_evictable(page)) {
pagevec_add(pvec, page);
@@ -286,50 +326,45 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
int i;
int nr = pagevec_count(pvec);
- int delta_munlocked = -nr;
+ int delta_munlocked;
struct pagevec pvec_putback;
int pgrescued = 0;
+ pagevec_init(&pvec_putback, 0);
+
/* Phase 1: page isolation */
spin_lock_irq(&zone->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
if (TestClearPageMlocked(page)) {
- struct lruvec *lruvec;
- int lru;
-
- if (PageLRU(page)) {
- lruvec = mem_cgroup_page_lruvec(page, zone);
- lru = page_lru(page);
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, lru);
- } else {
- __munlock_isolation_failed(page);
- goto skip_munlock;
- }
-
- } else {
-skip_munlock:
/*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin.
+ * We already have pin from follow_page_mask()
+ * so we can spare the get_page() here.
*/
- pvec->pages[i] = NULL;
- put_page(page);
- delta_munlocked++;
+ if (__munlock_isolate_lru_page(page, false))
+ continue;
+ else
+ __munlock_isolation_failed(page);
}
+
+ /*
+ * We won't be munlocking this page in the next phase
+ * but we still need to release the follow_page_mask()
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release() would deadlock.
+ */
+ pagevec_add(&pvec_putback, pvec->pages[i]);
+ pvec->pages[i] = NULL;
}
+ delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(&zone->lru_lock);
+ /* Now we can release pins of pages that we are not munlocking */
+ pagevec_release(&pvec_putback);
+
/* Phase 2: page munlock */
- pagevec_init(&pvec_putback, 0);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -440,7 +475,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
while (start < end) {
struct page *page = NULL;
- unsigned int page_mask, page_increm;
+ unsigned int page_mask;
+ unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
int zoneid;
@@ -490,7 +526,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ /* It's a bug to munlock in the middle of a THP page */
+ VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
cond_resched();
@@ -689,19 +727,21 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
- locked = len >> PAGE_SHIFT;
- locked += current->mm->locked_vm;
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
+ locked = len >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+
+ locked += current->mm->locked_vm;
/* check against resource limits */
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
+
up_write(&current->mm->mmap_sem);
if (!error)
error = __mm_populate(start, len, 0);
@@ -712,11 +752,13 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
int ret;
- down_write(&current->mm->mmap_sem);
len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
start &= PAGE_MASK;
+
+ down_write(&current->mm->mmap_sem);
ret = do_mlock(start, len, 0);
up_write(&current->mm->mmap_sem);
+
return ret;
}
@@ -761,12 +803,12 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (flags & MCL_CURRENT)
lru_add_drain_all(); /* flush pagevec */
- down_write(&current->mm->mmap_sem);
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
ret = -ENOMEM;
+ down_write(&current->mm->mmap_sem);
+
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e92d50..4074caf9936 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -202,5 +202,4 @@ static int __init mm_sysfs_init(void)
return 0;
}
-
-__initcall(mm_sysfs_init);
+postcore_initcall(mm_sysfs_init);
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d785f1..129b847d30c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,10 +6,13 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -36,6 +39,7 @@
#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -86,6 +90,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
@@ -359,20 +364,20 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- printk("vm_end %lx < vm_start %lx\n",
+ pr_info("vm_end %lx < vm_start %lx\n",
vma->vm_end, vma->vm_start);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- printk("free gap %lx, correct %lx\n",
+ pr_info("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
@@ -386,7 +391,7 @@ static int browse_rb(struct rb_root *root)
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- printk("backwards %d, forwards %d\n", j, i);
+ pr_info("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
@@ -404,7 +409,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
}
}
-void validate_mm(struct mm_struct *mm)
+static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
@@ -421,17 +426,17 @@ void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- printk("map_count %d vm_next %d\n", mm->map_count, i);
+ pr_info("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- printk("mm->highest_vm_end %lx, found %lx\n",
+ pr_info("mm->highest_vm_end %lx, found %lx\n",
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- printk("map_count %d rb %d\n", mm->map_count, i);
+ pr_info("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
BUG_ON(bug);
@@ -638,11 +643,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct address_space *mapping = NULL;
- if (vma->vm_file)
+ if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
-
- if (mapping)
mutex_lock(&mapping->i_mmap_mutex);
+ }
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
@@ -680,8 +684,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -893,7 +898,15 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- if (vma->vm_flags ^ vm_flags)
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressue on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
return 0;
if (vma->vm_file != file)
return 0;
@@ -1082,7 +1095,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
return a->vm_end == b->vm_start &&
mpol_equal(vma_policy(a), vma_policy(b)) &&
a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) &&
b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}
@@ -1190,6 +1203,24 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
+static inline int mlock_future_check(struct mm_struct *mm,
+ unsigned long flags,
+ unsigned long len)
+{
+ unsigned long locked, lock_limit;
+
+ /* mlock MCL_FUTURE? */
+ if (flags & VM_LOCKED) {
+ locked = len >> PAGE_SHIFT;
+ locked += mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK);
+ lock_limit >>= PAGE_SHIFT;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ return -EAGAIN;
+ }
+ return 0;
+}
+
/*
* The caller must hold down_write(&current->mm->mmap_sem).
*/
@@ -1251,16 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!can_do_mlock())
return -EPERM;
- /* mlock MCL_FUTURE? */
- if (vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ if (mlock_future_check(mm, vm_flags, len))
+ return -EAGAIN;
if (file) {
struct inode *inode = file_inode(file);
@@ -1280,7 +1303,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/*
* Make sure there are no mandatory locks on the file.
*/
- if (locks_verify_locked(inode))
+ if (locks_verify_locked(file))
return -EAGAIN;
vm_flags |= VM_SHARED | VM_MAYSHARE;
@@ -1970,34 +1993,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2369,7 +2391,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -2591,18 +2615,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
if (error & ~PAGE_MASK)
return error;
- /*
- * mlock MCL_FUTURE?
- */
- if (mm->def_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = len >> PAGE_SHIFT;
- locked += mm->locked_vm;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- lock_limit >>= PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return -EAGAIN;
- }
+ error = mlock_future_check(mm, mm->def_flags, len);
+ if (error)
+ return error;
/*
* mm->mmap_sem is required to protect against another thread
@@ -2859,6 +2874,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
return 1;
}
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf);
+
+/*
+ * Having a close hook prevents vma merging regardless of flags.
+ */
+static void special_mapping_close(struct vm_area_struct *vma)
+{
+}
+
+static const char *special_mapping_name(struct vm_area_struct *vma)
+{
+ return ((struct vm_special_mapping *)vma->vm_private_data)->name;
+}
+
+static const struct vm_operations_struct special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+ .name = special_mapping_name,
+};
+
+static const struct vm_operations_struct legacy_special_mapping_vmops = {
+ .close = special_mapping_close,
+ .fault = special_mapping_fault,
+};
static int special_mapping_fault(struct vm_area_struct *vma,
struct vm_fault *vmf)
@@ -2874,7 +2914,13 @@ static int special_mapping_fault(struct vm_area_struct *vma,
*/
pgoff = vmf->pgoff - vma->vm_pgoff;
- for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ if (vma->vm_ops == &legacy_special_mapping_vmops)
+ pages = vma->vm_private_data;
+ else
+ pages = ((struct vm_special_mapping *)vma->vm_private_data)->
+ pages;
+
+ for (; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
@@ -2887,37 +2933,18 @@ static int special_mapping_fault(struct vm_area_struct *vma,
return VM_FAULT_SIGBUS;
}
-/*
- * Having a close hook prevents vma merging regardless of flags.
- */
-static void special_mapping_close(struct vm_area_struct *vma)
-{
-}
-
-static const struct vm_operations_struct special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
-};
-
-/*
- * Called with mm->mmap_sem held for writing.
- * Insert a new vma covering the given region, with the given flags.
- * Its pages are supplied by the given array of struct page *.
- * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
- * The region past the last page supplied will always produce SIGBUS.
- * The array pointer and the pages it points to are assumed to stay alive
- * for as long as this mapping might exist.
- */
-int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, struct page **pages)
+static struct vm_area_struct *__install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_operations_struct *ops,
+ void *priv)
{
int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
if (unlikely(vma == NULL))
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&vma->anon_vma_chain);
vma->vm_mm = mm;
@@ -2927,8 +2954,8 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
- vma->vm_ops = &special_mapping_vmops;
- vma->vm_private_data = pages;
+ vma->vm_ops = ops;
+ vma->vm_private_data = priv;
ret = insert_vm_struct(mm, vma);
if (ret)
@@ -2938,11 +2965,40 @@ int install_special_mapping(struct mm_struct *mm,
perf_event_mmap(vma);
- return 0;
+ return vma;
out:
kmem_cache_free(vm_area_cachep, vma);
- return ret;
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called with mm->mmap_sem held for writing.
+ * Insert a new vma covering the given region, with the given flags.
+ * Its pages are supplied by the given array of struct page *.
+ * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
+ * The region past the last page supplied will always produce SIGBUS.
+ * The array pointer and the pages it points to are assumed to stay alive
+ * for as long as this mapping might exist.
+ */
+struct vm_area_struct *_install_special_mapping(
+ struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, const struct vm_special_mapping *spec)
+{
+ return __install_special_mapping(mm, addr, len, vm_flags,
+ &special_mapping_vmops, (void *)spec);
+}
+
+int install_special_mapping(struct mm_struct *mm,
+ unsigned long addr, unsigned long len,
+ unsigned long vm_flags, struct page **pages)
+{
+ struct vm_area_struct *vma = __install_special_mapping(
+ mm, addr, len, vm_flags, &legacy_special_mapping_vmops,
+ (void *)pages);
+
+ return PTR_ERR_OR_ZERO(vma);
}
static DEFINE_MUTEX(mm_all_locks_mutex);
@@ -3140,7 +3196,7 @@ static int init_user_reserve(void)
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
@@ -3161,7 +3217,7 @@ static int init_admin_reserve(void)
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
/*
* Reinititalise user and admin reserves if memory is added or removed.
@@ -3227,8 +3283,8 @@ static struct notifier_block reserve_mem_nb = {
static int __meminit init_reserve_notifier(void)
{
if (register_hotmemory_notifier(&reserve_mem_nb))
- printk("Failed registering memory add/remove notifier for admin reserve");
+ pr_err("Failed registering memory add/remove notifier for admin reserve\n");
return 0;
}
-module_init(init_reserve_notifier)
+subsys_initcall(init_reserve_notifier);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 8a8cd0265e5..f802c2d216a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm)
tsk->mm = mm;
switch_mm(active_mm, mm, tsk);
task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
if (active_mm != mm)
mmdrop(active_mm);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 93e6089cb45..41cefdf0aad 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -329,5 +329,4 @@ static int __init mmu_notifier_init(void)
{
return init_srcu_struct(&srcu);
}
-
-module_init(mmu_notifier_init);
+subsys_initcall(mmu_notifier_init);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 26667971c82..c43d557941f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
+#include <linux/ksm.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -35,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
+/*
+ * For a prot_numa update we only hold mmap_sem for read so there is a
+ * potential race with faulting where a pmd was temporarily none. This
+ * function checks for a transhuge pmd under the appropriate lock. It
+ * returns a pte if it was successfully locked or NULL if it raced with
+ * a transhuge insertion.
+ */
+static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, int prot_numa, spinlock_t **ptl)
+{
+ pte_t *pte;
+ spinlock_t *pmdl;
+
+ /* !prot_numa is protected by mmap_sem held for write */
+ if (!prot_numa)
+ return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+
+ pmdl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
+ spin_unlock(pmdl);
+ return NULL;
+ }
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+ spin_unlock(pmdl);
+ return pte;
+}
+
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -44,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
unsigned long pages = 0;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ if (!pte)
+ return 0;
+
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -52,34 +84,32 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t ptent;
bool updated = false;
- ptent = ptep_modify_prot_start(mm, addr, pte);
if (!prot_numa) {
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ if (pte_numa(ptent))
+ ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
+ /*
+ * Avoid taking write faults for pages we
+ * know to be dirty.
+ */
+ if (dirty_accountable && pte_dirty(ptent))
+ ptent = pte_mkwrite(ptent);
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
- if (page) {
+ if (page && !PageKsm(page)) {
if (!pte_numa(oldpte)) {
- ptent = pte_mknuma(ptent);
+ ptep_set_numa(mm, addr, pte);
updated = true;
}
}
}
-
- /*
- * Avoid taking write faults for pages we know to be
- * dirty.
- */
- if (dirty_accountable && pte_dirty(ptent)) {
- ptent = pte_mkwrite(ptent);
- updated = true;
- }
-
if (updated)
pages++;
- ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -110,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
+ unsigned long mni_start = 0;
pmd = pmd_offset(pud, addr);
do {
unsigned long this_pages;
next = pmd_addr_end(addr, end);
+ if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+ continue;
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!mni_start) {
+ mni_start = addr;
+ mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ }
+
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
@@ -131,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
+
+ /* huge pmd was handled */
continue;
}
}
- /* fall through */
+ /* fall through, the trans huge pmd just split */
}
- if (pmd_none_or_clear_bad(pmd))
- continue;
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa);
pages += this_pages;
} while (pmd++, addr = next, addr != end);
+ if (mni_start)
+ mmu_notifier_invalidate_range_end(mm, mni_start, end);
+
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
@@ -181,6 +225,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +237,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
+ clear_tlb_flush_pending(mm);
return pages;
}
@@ -200,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned long pages;
- mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
- mmu_notifier_invalidate_range_end(mm, start, end);
return pages;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb66f3..05f1180e9f2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -194,10 +194,17 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
break;
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
- if (extent == HPAGE_PMD_SIZE)
+ if (extent == HPAGE_PMD_SIZE) {
+ VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ anon_vma_lock_write(vma->anon_vma);
err = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
+ if (need_rmap_locks)
+ anon_vma_unlock_write(vma->anon_vma);
+ }
if (err > 0) {
need_flush = true;
continue;
diff --git a/mm/msync.c b/mm/msync.c
index 632df4527c0..992a1673d48 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
vma = find_vma(mm, start);
for (;;) {
struct file *file;
+ loff_t fstart, fend;
/* Still start < end. */
error = -ENOMEM;
@@ -77,12 +78,18 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
goto out_unlock;
}
file = vma->vm_file;
+ fstart = (start - vma->vm_start) +
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ fend = fstart + (min(end, vma->vm_end) - start) - 1;
start = vma->vm_end;
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- error = vfs_fsync(file, 0);
+ if (vma->vm_flags & VM_NONLINEAR)
+ error = vfs_fsync(file, 1);
+ else
+ error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d37465..7ed58602e71 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,11 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(goal, limit, size, align, nid);
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid);
if (!addr)
return NULL;
- memblock_reserve(addr, size);
+ if (memblock_reserve(addr, size))
+ return NULL;
+
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
/*
@@ -114,16 +116,27 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
- phys_addr_t start, end, size;
+ phys_addr_t start, end;
u64 i;
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);
- /* free range that is used for reserved array if we allocate it */
- size = get_allocated_memblock_reserved_regions_info(&start);
- if (size)
- count += __free_memory_core(start, start + size);
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ {
+ phys_addr_t size;
+
+ /* Free memblock.reserved array if it was allocated */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
+
+ /* Free memblock.memory array if it was allocated */
+ size = get_allocated_memblock_memory_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
+ }
+#endif
return count;
}
@@ -161,7 +174,7 @@ unsigned long __init free_all_bootmem(void)
reset_all_zones_managed_pages();
/*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
@@ -184,7 +197,6 @@ unsigned long __init free_all_bootmem(void)
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
- kmemleak_free_part(__va(physaddr), size);
memblock_free(physaddr, size);
}
@@ -199,7 +211,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
- kmemleak_free_part(__va(addr), size);
memblock_free(addr, size);
}
@@ -215,7 +226,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
restart:
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
if (ptr)
return ptr;
@@ -299,7 +310,7 @@ again:
if (ptr)
return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
goal, limit);
if (ptr)
return ptr;
@@ -321,7 +332,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
}
-void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal,
unsigned long limit)
{
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093adad9..4a852f6c570 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,8 +13,11 @@
* Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -24,12 +27,14 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
#include <linux/sched/sysctl.h>
+#include <linux/printk.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -60,6 +65,7 @@ unsigned long highest_memmap_pfn;
struct percpu_counter vm_committed_as;
int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio = 50; /* default is 50% */
+unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
@@ -295,7 +301,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
- return(count);
+ return count;
}
/*
@@ -458,7 +464,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __attribute__((weak)) vmalloc_sync_all(void)
+void __weak vmalloc_sync_all(void)
{
}
@@ -767,16 +773,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = NULL;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -824,8 +837,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -834,7 +847,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -873,8 +886,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = mm->mmap_cache;
- if (vma && vma->vm_start == addr && vma->vm_end == end)
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -885,7 +898,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -994,7 +1007,7 @@ static int validate_mmap_request(struct file *file,
(file->f_mode & FMODE_WRITE))
return -EACCES;
- if (locks_verify_locked(file_inode(file)))
+ if (locks_verify_locked(file))
return -EAGAIN;
if (!(capabilities & BDI_CAP_MAP_DIRECT))
@@ -1002,8 +1015,7 @@ static int validate_mmap_request(struct file *file,
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
- }
- else {
+ } else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -1034,23 +1046,20 @@ static int validate_mmap_request(struct file *file,
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
- }
- else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
- }
- else if ((prot & PROT_READ) &&
+ } else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
- }
- else {
+ } else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
@@ -1240,7 +1249,7 @@ error_free:
return ret;
enomem:
- printk("Allocation of length %lu from process %d (%s) failed\n",
+ pr_err("Allocation of length %lu from process %d (%s) failed\n",
len, current->pid, current->comm);
show_free_areas(0);
return -ENOMEM;
@@ -1658,7 +1667,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
- static int limit = 0;
+ static int limit;
if (limit < 5) {
printk(KERN_WARNING
"munmap of memory not mmapped by process %d"
@@ -1984,6 +1993,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long size, pgoff_t pgoff)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600a616..3291e82d435 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -47,19 +47,21 @@ static DEFINE_SPINLOCK(zone_scan_lock);
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligiblity for kill
- * @tsk: task struct of which task to consider
+ * @start: task struct of which task to consider
* @mask: nodemask passed to page allocator for mempolicy ooms
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
*/
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
+static bool has_intersects_mems_allowed(struct task_struct *start,
const nodemask_t *mask)
{
- struct task_struct *start = tsk;
+ struct task_struct *tsk;
+ bool ret = false;
- do {
+ rcu_read_lock();
+ for_each_thread(start, tsk) {
if (mask) {
/*
* If this is a mempolicy constrained oom, tsk's
@@ -67,19 +69,20 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- if (mempolicy_nodemask_intersects(tsk, mask))
- return true;
+ ret = mempolicy_nodemask_intersects(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
* check the mems of tsk's cpuset.
*/
- if (cpuset_mems_allowed_intersects(current, tsk))
- return true;
+ ret = cpuset_mems_allowed_intersects(current, tsk);
}
- } while_each_thread(start, tsk);
+ if (ret)
+ break;
+ }
+ rcu_read_unlock();
- return false;
+ return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
@@ -97,16 +100,21 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
*/
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
- struct task_struct *t = p;
+ struct task_struct *t;
- do {
+ rcu_read_lock();
+
+ for_each_thread(p, t) {
task_lock(t);
if (likely(t->mm))
- return t;
+ goto found;
task_unlock(t);
- } while_each_thread(p, t);
+ }
+ t = NULL;
+found:
+ rcu_read_unlock();
- return NULL;
+ return t;
}
/* return true if the task is not adequate as candidate victim task. */
@@ -170,7 +178,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- adj -= 30;
+ points -= (points * 3) / 100;
/* Normalize to oom_score_adj units */
adj *= totalpages / 1000;
@@ -301,7 +309,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long chosen_points = 0;
rcu_read_lock();
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
unsigned int points;
switch (oom_scan_process_thread(p, totalpages, nodemask,
@@ -319,11 +327,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
break;
};
points = oom_badness(p, NULL, nodemask, totalpages);
- if (points > chosen_points) {
- chosen = p;
- chosen_points = points;
- }
- } while_each_thread(g, p);
+ if (!points || points < chosen_points)
+ continue;
+ /* Prefer thread group leaders for display purposes */
+ if (points == chosen_points && thread_group_leader(chosen))
+ continue;
+
+ chosen = p;
+ chosen_points = points;
+ }
if (chosen)
get_task_struct(chosen);
rcu_read_unlock();
@@ -406,7 +418,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
{
struct task_struct *victim = p;
struct task_struct *child;
- struct task_struct *t = p;
+ struct task_struct *t;
struct mm_struct *mm;
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -437,7 +449,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* still freeing memory.
*/
read_lock(&tasklist_lock);
- do {
+ for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -455,13 +467,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
get_task_struct(victim);
}
}
- } while_each_thread(p, t);
+ }
read_unlock(&tasklist_lock);
- rcu_read_lock();
p = find_lock_task_mm(victim);
if (!p) {
- rcu_read_unlock();
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -487,6 +497,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* That thread will now get access to memory reserves since it has a
* pending fatal signal.
*/
+ rcu_read_lock();
for_each_process(p)
if (p->mm == mm && !same_thread_group(p, victim) &&
!(p->flags & PF_KTHREAD)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63807583d8e..e0c943014eb 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0;
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
/*
- * Work out the current dirty-memory clamping and background writeout
- * thresholds.
- *
- * The main aim here is to lower them aggressively if there is a lot of mapped
- * memory around. To avoid stressing page reclaim with lots of unreclaimable
- * pages. It is better to clamp down on writers than to start swapping, and
- * performing lots of scanning.
- *
- * We only allow 1/2 of the currently-unmapped memory to be dirtied.
- *
- * We don't permit the clamping level to fall below 5% - that is getting rather
- * excessive.
- *
- * We make sure that the background writeout level is below the adjusted
- * clamping level.
- */
-
-/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
* free and reclaimable pages, minus some zone reserves to protect
@@ -191,6 +173,26 @@ static unsigned long writeout_period_time = 0;
* global dirtyable memory first.
*/
+/**
+ * zone_dirtyable_memory - number of dirtyable pages in a zone
+ * @zone: the zone
+ *
+ * Returns the zone's number of pages potentially available for dirty
+ * page cache. This is the base value for the per-zone dirty limits.
+ */
+static unsigned long zone_dirtyable_memory(struct zone *zone)
+{
+ unsigned long nr_pages;
+
+ nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+
+ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
+ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+
+ return nr_pages;
+}
+
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
@@ -198,11 +200,9 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
unsigned long x = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
- struct zone *z =
- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
- x += zone_page_state(z, NR_FREE_PAGES) +
- zone_reclaimable_pages(z) - z->dirty_balance_reserve;
+ x += zone_dirtyable_memory(z);
}
/*
* Unreclaimable memory (kernel memory or anonymous memory
@@ -238,9 +238,12 @@ static unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x = global_page_state(NR_FREE_PAGES);
x -= min(x, dirty_balance_reserve);
+ x += global_page_state(NR_INACTIVE_FILE);
+ x += global_page_state(NR_ACTIVE_FILE);
+
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -289,32 +292,6 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
}
/**
- * zone_dirtyable_memory - number of dirtyable pages in a zone
- * @zone: the zone
- *
- * Returns the zone's number of pages potentially available for dirty
- * page cache. This is the base value for the per-zone dirty limits.
- */
-static unsigned long zone_dirtyable_memory(struct zone *zone)
-{
- /*
- * The effective global number of dirtyable pages may exclude
- * highmem as a big-picture measure to keep the ratio between
- * dirty memory and lowmem reasonable.
- *
- * But this function is purely about the individual zone and a
- * highmem zone can hold its share of dirty pages, so we don't
- * care about vm_highmem_is_dirtyable here.
- */
- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone);
-
- /* don't allow this to underflow */
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
- return nr_pages;
-}
-
-/**
* zone_dirty_limit - maximum number of dirty pages allowed in a zone
* @zone: the zone
*
@@ -598,14 +575,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
* (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
* => fast response on large errors; small oscillation near setpoint
*/
-static inline long long pos_ratio_polynom(unsigned long setpoint,
+static long long pos_ratio_polynom(unsigned long setpoint,
unsigned long dirty,
unsigned long limit)
{
long long pos_ratio;
long x;
- x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
+ x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
limit - setpoint + 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -847,7 +824,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
x_intercept = bdi_setpoint + span;
if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
x_intercept - bdi_setpoint + 1);
} else
pos_ratio /= 4;
@@ -1329,9 +1306,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
if (bdi_bg_thresh)
- *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
- background_thresh,
- dirty_thresh);
+ *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
+ background_thresh,
+ dirty_thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -1567,9 +1544,9 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page, int page_mkwrite)
+void set_page_dirty_balance(struct page *page)
{
- if (set_page_dirty(page) || page_mkwrite) {
+ if (set_page_dirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
@@ -1628,7 +1605,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* 1000+ tasks, all of them start dirtying pages at exactly the same
* time, hence all honoured too large initial task->nr_dirtied_pause.
*/
- p = &__get_cpu_var(bdp_ratelimits);
+ p = this_cpu_ptr(&bdp_ratelimits);
if (unlikely(current->nr_dirtied >= ratelimit))
*p = 0;
else if (unlikely(*p >= ratelimit_pages)) {
@@ -1640,7 +1617,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
* short-lived tasks (eg. gcc invocations in a kernel build) escaping
* the dirty throttling and livelock other long-run dirtiers.
*/
- p = &__get_cpu_var(dirty_throttle_leaks);
+ p = this_cpu_ptr(&dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
@@ -1687,7 +1664,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
-int dirty_writeback_centisecs_handler(ctl_table *table, int write,
+int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
@@ -2178,11 +2155,12 @@ int __set_page_dirty_nobuffers(struct page *page)
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
+ unsigned long flags;
if (!mapping)
return 1;
- spin_lock_irq(&mapping->tree_lock);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
@@ -2191,7 +2169,7 @@ int __set_page_dirty_nobuffers(struct page *page)
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -2402,7 +2380,7 @@ int test_clear_page_writeback(struct page *page)
return ret;
}
-int test_set_page_writeback(struct page *page)
+int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret;
@@ -2427,9 +2405,10 @@ int test_set_page_writeback(struct page *page)
radix_tree_tag_clear(&mapping->page_tree,
page_index(page),
PAGECACHE_TAG_DIRTY);
- radix_tree_tag_clear(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_TOWRITE);
+ if (!keep_write)
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_TOWRITE);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
} else {
ret = TestSetPageWriteback(page);
@@ -2440,7 +2419,7 @@ int test_set_page_writeback(struct page *page)
return ret;
}
-EXPORT_SYMBOL(test_set_page_writeback);
+EXPORT_SYMBOL(__test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f075ed..ef44ad736ca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
@@ -205,7 +206,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -261,8 +262,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
} while (zone_span_seqretry(zone, seq));
if (ret)
- pr_err("page %lu outside zone [ %lu - %lu ]\n",
- pfn, start_pfn, start_pfn + sp);
+ pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
+ pfn, zone_to_nid(zone), zone->name,
+ start_pfn, start_pfn + sp);
return ret;
}
@@ -295,7 +297,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, const char *reason,
+ unsigned long bad_flags)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -329,7 +332,7 @@ static void bad_page(struct page *page)
printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page(page);
+ dump_page_badflags(page, reason, bad_flags);
print_modules();
dump_stack();
@@ -369,9 +372,11 @@ void prep_compound_page(struct page *page, unsigned long order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- __SetPageTail(p);
set_page_count(p, 0);
p->first_page = page;
+ /* Make sure p->first_page is always valid for PageTail() */
+ smp_wmb();
+ __SetPageTail(p);
}
}
@@ -383,7 +388,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int bad = 0;
if (unlikely(compound_order(page) != order)) {
- bad_page(page);
+ bad_page(page, "wrong compound order", 0);
bad++;
}
@@ -392,8 +397,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- if (unlikely(!PageTail(p) || (p->first_page != page))) {
- bad_page(page);
+ if (unlikely(!PageTail(p))) {
+ bad_page(page, "PageTail not set", 0);
+ bad++;
+ } else if (unlikely(p->first_page != page)) {
+ bad_page(page, "first_page not consistent", 0);
bad++;
}
__ClearPageTail(p);
@@ -402,7 +410,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
return bad;
}
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+static inline void prep_zero_page(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
{
int i;
@@ -446,7 +455,7 @@ static inline void set_page_guard_flag(struct page *page) { }
static inline void clear_page_guard_flag(struct page *page) { }
#endif
-static inline void set_page_order(struct page *page, int order)
+static inline void set_page_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -497,21 +506,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
* For recording page's order, we use page_private(page).
*/
static inline int page_is_buddy(struct page *page, struct page *buddy,
- int order)
+ unsigned int order)
{
if (!pfn_valid_within(page_to_pfn(buddy)))
return 0;
- if (page_zone_id(page) != page_zone_id(buddy))
- return 0;
-
if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON(page_count(buddy) != 0);
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ /*
+ * zone check is done late to avoid uselessly
+ * calculating zone/node ids for pages that could
+ * never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return 0;
+
return 1;
}
return 0;
@@ -543,6 +562,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
*/
static inline void __free_one_page(struct page *page,
+ unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype)
{
@@ -559,10 +579,10 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(migratetype == -1);
- page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
- VM_BUG_ON(page_idx & ((1 << order) - 1));
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
while (order < MAX_ORDER-1) {
buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +638,23 @@ out:
static inline int free_pages_check(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ const char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
page_cpupid_reset_last(page);
@@ -683,7 +714,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
list_del(&page->lru);
mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, zone, 0, mt);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
if (likely(!is_migrate_isolate_page(page))) {
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
@@ -695,13 +726,15 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock(&zone->lock);
}
-static void free_one_page(struct zone *zone, struct page *page, int order,
+static void free_one_page(struct zone *zone,
+ struct page *page, unsigned long pfn,
+ unsigned int order,
int migratetype)
{
spin_lock(&zone->lock);
zone->pages_scanned = 0;
- __free_one_page(page, zone, order, migratetype);
+ __free_one_page(page, pfn, zone, order, migratetype);
if (unlikely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
@@ -738,15 +771,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
int migratetype;
+ unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order))
return;
+ migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- migratetype = get_pageblock_migratetype(page);
set_freepage_migratetype(page, migratetype);
- free_one_page(page_zone(page), page, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -782,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
set_page_count(p, 0);
} while (++p, --i);
- set_page_refcounted(page);
set_pageblock_migratetype(page, MIGRATE_CMA);
- __free_pages(page, pageblock_order);
+
+ if (pageblock_order >= MAX_ORDER) {
+ i = pageblock_nr_pages;
+ p = page;
+ do {
+ set_page_refcounted(p);
+ __free_pages(p, MAX_ORDER - 1);
+ p += MAX_ORDER_NR_PAGES;
+ } while (i -= MAX_ORDER_NR_PAGES);
+ } else {
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
+ }
+
adjust_managed_page_count(page, pageblock_nr_pages);
}
#endif
@@ -813,7 +859,7 @@ static inline void expand(struct zone *zone, struct page *page,
area--;
high--;
size >>= 1;
- VM_BUG_ON(bad_range(zone, &page[size]));
+ VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
#ifdef CONFIG_DEBUG_PAGEALLOC
if (high < debug_guardpage_minorder()) {
@@ -843,18 +889,29 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- if (unlikely(page_mapcount(page) |
- (page->mapping != NULL) |
- (atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
- (mem_cgroup_bad_page_check(page)))) {
- bad_page(page);
+ const char *bad_reason = NULL;
+ unsigned long bad_flags = 0;
+
+ if (unlikely(page_mapcount(page)))
+ bad_reason = "nonzero mapcount";
+ if (unlikely(page->mapping != NULL))
+ bad_reason = "non-NULL mapping";
+ if (unlikely(atomic_read(&page->_count) != 0))
+ bad_reason = "nonzero _count";
+ if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+ bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+ }
+ if (unlikely(mem_cgroup_bad_page_check(page)))
+ bad_reason = "cgroup check failed";
+ if (unlikely(bad_reason)) {
+ bad_page(page, bad_reason, bad_flags);
return 1;
}
return 0;
}
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
{
int i;
@@ -903,6 +960,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
+ set_freepage_migratetype(page, migratetype);
return page;
}
@@ -955,7 +1013,7 @@ int move_freepages(struct zone *zone,
for (page = start_page; page <= end_page;) {
/* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
@@ -1029,7 +1087,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/*
* When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself.
+ * buddy pages to CMA itself. We also ensure the freepage_migratetype
+ * is set to CMA so it is returned to the correct freelist in case
+ * the page ends up being not actually allocated from the pcp lists.
*/
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1062,16 +1122,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
{
struct free_area *area;
- int current_order;
+ unsigned int current_order;
struct page *page;
int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1; current_order >= order;
- --current_order) {
+ for (current_order = MAX_ORDER-1;
+ current_order >= order && current_order <= MAX_ORDER-1;
+ --current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
@@ -1097,6 +1158,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
expand(zone, page, order, current_order, area,
new_type);
+ /* The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages. This is OK as long as it does
+ * not differ for MIGRATE_CMA type.
+ */
+ set_freepage_migratetype(page, new_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype, new_type);
@@ -1145,9 +1212,9 @@ retry_reserve:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, int cold)
+ int migratetype, bool cold)
{
- int mt = migratetype, i;
+ int i;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
@@ -1164,18 +1231,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* merge IO requests if the physical pages are ordered
* properly.
*/
- if (likely(cold == 0))
+ if (likely(!cold))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
- if (IS_ENABLED(CONFIG_CMA)) {
- mt = get_pageblock_migratetype(page);
- if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
- mt = migratetype;
- }
- set_freepage_migratetype(page, mt);
list = &page->lru;
- if (is_migrate_cma(mt))
+ if (is_migrate_cma(get_freepage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1299,7 +1360,7 @@ void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn;
unsigned long flags;
- int order, t;
+ unsigned int order, t;
struct list_head *curr;
if (zone_is_empty(zone))
@@ -1331,19 +1392,20 @@ void mark_free_pages(struct zone *zone)
/*
* Free a 0-order page
- * cold == 1 ? free a cold page : free a hot page
+ * cold == true ? free a cold page : free a hot page
*/
-void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pages_prepare(page, 0))
return;
- migratetype = get_pageblock_migratetype(page);
+ migratetype = get_pfnblock_migratetype(page, pfn);
set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -1357,17 +1419,17 @@ void free_hot_cold_page(struct page *page, int cold)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (cold)
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
- else
+ if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
+ else
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = ACCESS_ONCE(pcp->batch);
@@ -1382,7 +1444,7 @@ out:
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, int cold)
+void free_hot_cold_page_list(struct list_head *list, bool cold)
{
struct page *page, *next;
@@ -1404,8 +1466,8 @@ void split_page(struct page *page, unsigned int order)
{
int i;
- VM_BUG_ON(PageCompound(page));
- VM_BUG_ON(!page_count(page));
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!page_count(page), page);
#ifdef CONFIG_KMEMCHECK
/*
@@ -1494,12 +1556,12 @@ int split_free_page(struct page *page)
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
+ struct zone *zone, unsigned int order,
+ gfp_t gfp_flags, int migratetype)
{
unsigned long flags;
struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);
+ bool cold = ((gfp_flags & __GFP_COLD) != 0);
again:
if (likely(order == 0)) {
@@ -1544,15 +1606,16 @@ again:
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_pageblock_migratetype(page));
+ get_freepage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
- VM_BUG_ON(bad_range(zone, page));
+ VM_BUG_ON_PAGE(bad_range(zone, page), page);
if (prep_new_page(page, order, gfp_flags))
goto again;
return page;
@@ -1643,8 +1706,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
* Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags, long free_pages)
+static bool __zone_watermark_ok(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags,
+ long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
@@ -1678,15 +1742,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
-bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
-bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
@@ -1816,23 +1880,13 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
- return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+ return local_zone->node == zone->node;
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
- return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
-}
-
-static void __paginginit init_zone_allows_reclaim(int nid)
-{
- int i;
-
- for_each_online_node(i)
- if (node_distance(nid, i) <= RECLAIM_DISTANCE)
- node_set(i, NODE_DATA(nid)->reclaim_nodes);
- else
- zone_reclaim_mode = 1;
+ return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+ RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
@@ -1866,9 +1920,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
return true;
}
-static inline void init_zone_allows_reclaim(int nid)
-{
-}
#endif /* CONFIG_NUMA */
/*
@@ -1878,17 +1929,17 @@ static inline void init_zone_allows_reclaim(int nid)
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
+ struct zone *preferred_zone, int classzone_idx, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
- int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
+ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
+ (gfp_mask & __GFP_WRITE);
- classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
@@ -1901,30 +1952,20 @@ zonelist_scan:
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
- goto try_this_zone;
/*
* Distribute pages in proportion to the individual
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
- *
- * When zone_reclaim_mode is enabled, try to stay in
- * local zones in the fastpath. If that fails, the
- * slowpath is entered, which will do another pass
- * starting with the local zones, but ultimately fall
- * back to remote zones that do not partake in the
- * fairness round-robin cycle of this zonelist.
*/
- if (alloc_flags & ALLOC_WMARK_LOW) {
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+ if (alloc_flags & ALLOC_FAIR) {
+ if (!zone_local(preferred_zone, zone))
continue;
- if (zone_reclaim_mode &&
- !zone_local(preferred_zone, zone))
+ if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
continue;
}
/*
@@ -1953,15 +1994,19 @@ zonelist_scan:
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
- goto this_zone_full;
+ if (consider_zone_dirty && !zone_dirty_ok(zone))
+ continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags)) {
int ret;
+ /* Checked here to keep the fast path fast */
+ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ goto try_this_zone;
+
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
@@ -2023,7 +2068,7 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA))
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
zlc_mark_zone_full(zonelist, z);
}
@@ -2073,13 +2118,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
return;
/*
- * Walking all memory to count page types is very expensive and should
- * be inhibited in non-blockable contexts.
- */
- if (!(gfp_mask & __GFP_WAIT))
- filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
- /*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
@@ -2159,7 +2197,7 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
@@ -2177,7 +2215,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto out;
@@ -2212,7 +2250,7 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
+ int classzone_idx, int migratetype, enum migrate_mode mode,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
@@ -2226,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, sync_migration,
+ nodemask, mode,
contended_compaction);
current->flags &= ~PF_MEMALLOC;
@@ -2240,13 +2278,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
preferred_zone->compact_blockskip_flush = false;
- preferred_zone->compact_considered = 0;
- preferred_zone->compact_defer_shift = 0;
- if (order >= preferred_zone->compact_order_failed)
- preferred_zone->compact_order_failed = order + 1;
+ compaction_defer_reset(preferred_zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
@@ -2262,7 +2297,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
* As async compaction considers a subset of pageblocks, only
* defer if the failure was a sync compaction failure.
*/
- if (sync_migration)
+ if (mode != MIGRATE_ASYNC)
defer_compaction(preferred_zone, order);
cond_resched();
@@ -2275,9 +2310,9 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, bool sync_migration,
- bool *contended_compaction, bool *deferred_compaction,
- unsigned long *did_some_progress)
+ int classzone_idx, int migratetype,
+ enum migrate_mode mode, bool *contended_compaction,
+ bool *deferred_compaction, unsigned long *did_some_progress)
{
return NULL;
}
@@ -2316,7 +2351,7 @@ static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
@@ -2334,7 +2369,8 @@ retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx,
+ migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2357,14 +2393,14 @@ static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
struct page *page;
do {
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2373,37 +2409,45 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
/*
* Only reset the batches of zones that were actually
- * considered in the fast path, we don't want to
- * thrash fairness information for zones that are not
+ * considered in the fairness pass, we don't want to
+ * trash fairness information for zones that are not
* actually part of this zonelist's round-robin cycle.
*/
- if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+ if (!zone_local(preferred_zone, zone))
continue;
mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
}
}
+static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2412,20 +2456,20 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (!wait) {
+ if (atomic) {
/*
- * Not worth trying to allocate harder for
- * __GFP_NOMEMALLOC even if it can't schedule.
+ * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+ * if it can't schedule.
*/
- if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
/*
- * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+ * comment for __cpuset_node_allowed_softwall().
*/
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2457,14 +2501,14 @@ static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int migratetype)
+ int classzone_idx, int migratetype)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- bool sync_migration = false;
+ enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
bool contended_compaction = false;
@@ -2488,12 +2532,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* over allocated.
*/
if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- prepare_slowpath(gfp_mask, order, zonelist,
- high_zoneidx, preferred_zone);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2506,15 +2550,18 @@ restart:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
- first_zones_zonelist(zonelist, high_zoneidx, NULL,
- &preferred_zone);
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ struct zoneref *preferred_zoneref;
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
+ NULL, &preferred_zone);
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ }
rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2529,15 +2576,22 @@ rebalance:
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
}
/* Atomic allocations - we can't balance anything */
- if (!wait)
+ if (!wait) {
+ /*
+ * All existing users of the deprecated __GFP_NOFAIL are
+ * blockable, so warn of any new users that actually allow this
+ * type of allocation to fail.
+ */
+ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
+ }
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
@@ -2551,17 +2605,23 @@ rebalance:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
goto got_pg;
- sync_migration = true;
+
+ /*
+ * It can become very expensive to allocate transparent hugepages at
+ * fault, so use asynchronous memory compaction for THP unless it is
+ * khugepaged trying to collapse.
+ */
+ if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+ migration_mode = MIGRATE_SYNC_LIGHT;
/*
* If compaction is deferred for high-order allocations, it is because
@@ -2578,7 +2638,8 @@ rebalance:
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ classzone_idx, migratetype,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -2597,7 +2658,7 @@ rebalance:
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
- migratetype);
+ classzone_idx, migratetype);
if (page)
goto got_pg;
@@ -2636,12 +2697,11 @@ rebalance:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- migratetype, sync_migration,
- &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask, alloc_flags,
+ preferred_zone,
+ classzone_idx, migratetype,
+ migration_mode, &contended_compaction,
&deferred_compaction,
&did_some_progress);
if (page)
@@ -2667,11 +2727,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
+ struct zoneref *preferred_zoneref;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
- struct mem_cgroup *memcg = NULL;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+ int classzone_idx;
gfp_mask &= gfp_allowed_mask;
@@ -2690,33 +2751,44 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- /*
- * Will only have any effect when __GFP_KMEMCG is set. This is
- * verified in the (always inline) callee
- */
- if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
- return NULL;
-
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx,
+ preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
+ classzone_idx = zonelist_zone_idx(preferred_zoneref);
#ifdef CONFIG_CMA
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
+retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
if (unlikely(!page)) {
/*
+ * The first pass makes sure allocations are spread
+ * fairly within the local node. However, the local
+ * node might have free pages left after the fairness
+ * batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without
+ * fairness, and include remote zones now, before
+ * entering the slowpath and waking kswapd: prefer
+ * spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ reset_alloc_batches(zonelist, high_zoneidx,
+ preferred_zone);
+ alloc_flags &= ~ALLOC_FAIR;
+ goto retry;
+ }
+ /*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
@@ -2724,7 +2796,7 @@ retry_cpuset:
gfp_mask = memalloc_noio_flags(gfp_mask);
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
+ preferred_zone, classzone_idx, migratetype);
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2736,11 +2808,9 @@ out:
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+ if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
- memcg_kmem_commit_charge(page, memcg, order);
-
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2775,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
else
__free_pages_ok(page, order);
}
@@ -2794,27 +2864,51 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
- *
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
+ * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
+ * of the current memory cgroup.
*
- * The caller knows better which flags it relies on.
+ * It should be used when the caller would like to use kmalloc, but since the
+ * allocation is large, it has to fall back to the page allocator.
*/
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages(gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
+}
+
+struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+{
+ struct page *page;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+ page = alloc_pages_node(nid, gfp_mask, order);
+ memcg_kmem_commit_charge(page, memcg, order);
+ return page;
+}
+
+/*
+ * __free_kmem_pages and free_kmem_pages will free pages allocated with
+ * alloc_kmem_pages.
+ */
+void __free_kmem_pages(struct page *page, unsigned int order)
{
memcg_kmem_uncharge_pages(page, order);
__free_pages(page, order);
}
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+void free_kmem_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+ __free_kmem_pages(virt_to_page((void *)addr), order);
}
}
@@ -3004,9 +3098,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
goto out;
do {
- cpuset_mems_cookie = get_mems_allowed();
+ cpuset_mems_cookie = read_mems_allowed_begin();
ret = !node_isset(nid, cpuset_current_mems_allowed);
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
out:
return ret;
}
@@ -3308,7 +3402,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
-int numa_zonelist_order_handler(ctl_table *table, int write,
+int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
@@ -3902,6 +3996,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
struct page *page;
unsigned long block_migratetype;
int reserve;
+ int old_reserve;
/*
* Get the start pfn, end pfn and the number of blocks to reserve
@@ -3923,6 +4018,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
+ old_reserve = zone->nr_migrate_reserve_block;
+
+ /* When memory hot-add, we almost always need to do nothing */
+ if (reserve == old_reserve)
+ return;
+ zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
@@ -3960,6 +4061,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
reserve--;
continue;
}
+ } else if (!old_reserve) {
+ /*
+ * At boot time we don't need to scan the whole zone
+ * for turning off MIGRATE_RESERVE.
+ */
+ break;
}
/*
@@ -4039,7 +4146,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
static void __meminit zone_init_free_lists(struct zone *zone)
{
- int order, t;
+ unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
@@ -4051,7 +4158,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int __meminit zone_batchsize(struct zone *zone)
+static int zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -4167,8 +4274,8 @@ static void pageset_set_high(struct per_cpu_pageset *p,
pageset_update(&p->pcp, high, batch);
}
-static void __meminit pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
+static void pageset_set_high_and_batch(struct zone *zone,
+ struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
@@ -4210,7 +4317,6 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;
/*
@@ -4226,7 +4332,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node_nopanic(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4293,9 +4400,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- * Architectures may implement their own version but if add_active_range()
- * was used and there are no special requirements, this is a convenient
- * alternative
*/
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
@@ -4346,13 +4450,13 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif
/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * If an architecture guarantees that all ranges registered contain no holes
+ * and may be freed, this this function may be used instead of calling
+ * memblock_free_early_nid() manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4364,9 +4468,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);
if (start_pfn < end_pfn)
- free_bootmem_node(NODE_DATA(this_nid),
- PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT);
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}
@@ -4374,9 +4478,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
*
- * If an architecture guarantees that all ranges registered with
- * add_active_ranges() contain no holes and may be freed, this
- * function may be used instead of calling memory_present() manually.
+ * If an architecture guarantees that all ranges registered contain no holes and may
+ * be freed, this function may be used instead of calling memory_present() manually.
*/
void __init sparse_memory_present_with_active_regions(int nid)
{
@@ -4394,7 +4497,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
- * provided by an arch calling add_active_range(). If called for a node
+ * provided by memblock_set_node(). If called for a node
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
@@ -4637,8 +4740,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
- usemapsize);
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4832,7 +4936,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node_nopanic(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4862,7 +4967,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -4970,7 +5074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
* It returns the minimum PFN based on information provided via
- * add_active_range().
+ * memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
@@ -5013,9 +5117,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+ struct memblock_region *r;
+
+ /* Need to find movable_zone earlier when movable_node is specified. */
+ find_usable_zone_for_movable();
+
+ /*
+ * If movable_node is specified, ignore kernelcore and movablecore
+ * options.
+ */
+ if (movable_node_is_enabled()) {
+ for_each_memblock(memory, r) {
+ if (!memblock_is_hotpluggable(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = PFN_DOWN(r->base);
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ goto out2;
+ }
/*
- * If movablecore was specified, calculate what size of
+ * If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
@@ -5041,7 +5169,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
- find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
@@ -5132,6 +5259,7 @@ restart:
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
+out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
@@ -5167,7 +5295,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
- * Using the page ranges provided by add_active_range(), the size of each
+ * Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
@@ -5690,10 +5818,15 @@ module_init(init_per_zone_wmark_min)
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
@@ -5702,7 +5835,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
}
#ifdef CONFIG_NUMA
-int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -5718,7 +5851,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
return 0;
}
-int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
+int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
@@ -5744,7 +5877,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
-int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -5757,27 +5890,42 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
*/
-int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
- unsigned int cpu;
+ int old_percpu_pagelist_fraction;
int ret;
+ mutex_lock(&pcp_batch_high_lock);
+ old_percpu_pagelist_fraction = percpu_pagelist_fraction;
+
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || (ret < 0))
- return ret;
+ if (!write || ret < 0)
+ goto out;
+
+ /* Sanity checking to avoid pcp imbalance */
+ if (percpu_pagelist_fraction &&
+ percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
+ percpu_pagelist_fraction = old_percpu_pagelist_fraction;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* No change? */
+ if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
+ goto out;
- mutex_lock(&pcp_batch_high_lock);
for_each_populated_zone(zone) {
- unsigned long high;
- high = zone->managed_pages / percpu_pagelist_fraction;
+ unsigned int cpu;
+
for_each_possible_cpu(cpu)
- pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
- high);
+ pageset_set_high_and_batch(zone,
+ per_cpu_ptr(zone->pageset, cpu));
}
+out:
mutex_unlock(&pcp_batch_high_lock);
- return 0;
+ return ret;
}
int hashdist = HASHDIST_DEFAULT;
@@ -5858,7 +6006,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem_nopanic(size);
+ table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
@@ -5914,59 +6062,73 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
}
/**
- * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
- * @start_bitidx: The first bit of interest to retrieve
- * @end_bitidx: The last bit of interest
- * returns pageblock_bits flags
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
*/
-unsigned long get_pageblock_flags_group(struct page *page,
- int start_bitidx, int end_bitidx)
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long flags = 0;
- unsigned long value = 1;
+ unsigned long bitidx, word_bitidx;
+ unsigned long word;
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (test_bit(bitidx + start_bitidx, bitmap))
- flags |= value;
-
- return flags;
+ word = bitmap[word_bitidx];
+ bitidx += end_bitidx;
+ return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
/**
- * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
- * @start_bitidx: The first bit of interest
- * @end_bitidx: The last bit of interest
* @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
*/
-void set_pageblock_flags_group(struct page *page, unsigned long flags,
- int start_bitidx, int end_bitidx)
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+ unsigned long pfn,
+ unsigned long end_bitidx,
+ unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
- unsigned long pfn, bitidx;
- unsigned long value = 1;
+ unsigned long bitidx, word_bitidx;
+ unsigned long old_word, word;
+
+ BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
- pfn = page_to_pfn(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
- VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+ word_bitidx = bitidx / BITS_PER_LONG;
+ bitidx &= (BITS_PER_LONG-1);
- for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
- if (flags & value)
- __set_bit(bitidx + start_bitidx, bitmap);
- else
- __clear_bit(bitidx + start_bitidx, bitmap);
+ VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
+
+ bitidx += end_bitidx;
+ mask <<= (BITS_PER_LONG - bitidx - 1);
+ flags <<= (BITS_PER_LONG - bitidx - 1);
+
+ word = ACCESS_ONCE(bitmap[word_bitidx]);
+ for (;;) {
+ old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+ if (word == old_word)
+ break;
+ word = old_word;
+ }
}
/*
@@ -6126,7 +6288,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
cc->nr_migratepages -= nr_reclaimed;
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- 0, MIGRATE_SYNC, MR_CMA);
+ NULL, 0, cc->mode, MR_CMA);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
@@ -6165,7 +6327,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
- .sync = true,
+ .mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -6320,7 +6482,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
struct zone *zone;
- int order, i;
+ unsigned int order, i;
unsigned long pfn;
unsigned long flags;
/* find the first valid pfn */
@@ -6372,7 +6534,7 @@ bool is_free_buddy_page(struct page *page)
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
- int order;
+ unsigned int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
@@ -6458,12 +6620,25 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, const char *reason,
+ unsigned long badflags)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ if (reason)
+ pr_alert("page dumped because: %s\n", reason);
+ if (page->flags & badflags) {
+ pr_alert("bad because of flags:\n");
+ dump_page_flags(page->flags & badflags);
+ }
mem_cgroup_print_bad_page(page);
}
+
+void dump_page(struct page *page, const char *reason)
+{
+ dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL(dump_page);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a872..3708264d283 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)
table_size = sizeof(struct page_cgroup) * nr_pages;
- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;
@@ -174,7 +175,7 @@ static void free_page_cgroup(void *addr)
}
}
-void __free_page_cgroup(unsigned long pfn)
+static void __free_page_cgroup(unsigned long pfn)
{
struct mem_section *ms;
struct page_cgroup *base;
@@ -187,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn)
ms->page_cgroup = NULL;
}
-int __meminit online_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages,
- int nid)
+static int __meminit online_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages,
+ int nid)
{
unsigned long start, end, pfn;
int fail = 0;
@@ -222,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
return -ENOMEM;
}
-int __meminit offline_page_cgroup(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+static int __meminit offline_page_cgroup(unsigned long start_pfn,
+ unsigned long nr_pages, int nid)
{
unsigned long start, end, pfn;
@@ -451,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
* lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
* @ent: swap entry to be looked up.
*
- * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
+ * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
diff --git a/mm/page_io.c b/mm/page_io.c
index 8c79a4764be..955db8b0d49 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -31,13 +31,13 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio = bio_alloc(gfp_flags, 1);
if (bio) {
- bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
- bio->bi_sector <<= PAGE_SHIFT - 9;
+ bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev);
+ bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_io_vec[0].bv_page = page;
bio->bi_io_vec[0].bv_len = PAGE_SIZE;
bio->bi_io_vec[0].bv_offset = 0;
bio->bi_vcnt = 1;
- bio->bi_size = PAGE_SIZE;
+ bio->bi_iter.bi_size = PAGE_SIZE;
bio->bi_end_io = end_io;
}
return bio;
@@ -62,7 +62,7 @@ void end_swap_bio_write(struct bio *bio, int err)
printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -80,7 +80,7 @@ void end_swap_bio_read(struct bio *bio, int err)
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_sector);
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
@@ -248,21 +248,34 @@ out:
return ret;
}
+static sector_t swap_page_sector(struct page *page)
+{
+ return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+}
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
void (*end_write_func)(struct bio *, int))
{
struct bio *bio;
- int ret = 0, rw = WRITE;
+ int ret, rw = WRITE;
struct swap_info_struct *sis = page_swap_info(page);
if (sis->flags & SWP_FILE) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
- struct iovec iov = {
- .iov_base = kmap(page),
- .iov_len = PAGE_SIZE,
+ struct bio_vec bv = {
+ .bv_page = page,
+ .bv_len = PAGE_SIZE,
+ .bv_offset = 0
+ };
+ struct iov_iter from = {
+ .type = ITER_BVEC | WRITE,
+ .count = PAGE_SIZE,
+ .iov_offset = 0,
+ .nr_segs = 1,
};
+ from.bvec = &bv; /* older gcc versions are broken */
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
@@ -270,10 +283,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
set_page_writeback(page);
unlock_page(page);
- ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
- &kiocb, &iov,
- kiocb.ki_pos, 1);
- kunmap(page);
+ ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
+ &kiocb, &from,
+ kiocb.ki_pos);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
@@ -297,6 +309,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return ret;
}
+ ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
+ if (!ret) {
+ count_vm_event(PSWPOUT);
+ return 0;
+ }
+
+ ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
@@ -320,8 +339,8 @@ int swap_readpage(struct page *page)
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
unlock_page(page);
@@ -338,6 +357,13 @@ int swap_readpage(struct page *page)
return ret;
}
+ ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+ if (!ret) {
+ count_vm_event(PSWPIN);
+ return 0;
+ }
+
+ ret = 0;
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
diff --git a/mm/percpu.c b/mm/percpu.c
index 0d10defe951..2ddf9a990db 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -102,10 +102,11 @@ struct pcpu_chunk {
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used */
+ int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
void *data; /* chunk data */
+ int first_free; /* no free below this */
bool immutable; /* no [de]population allowed */
unsigned long populated[]; /* populated bitmap */
};
@@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
{
int new_alloc;
- if (chunk->map_alloc >= chunk->map_used + 2)
+ if (chunk->map_alloc >= chunk->map_used + 3)
return 0;
new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + 2)
+ while (new_alloc < chunk->map_used + 3)
new_alloc *= 2;
return new_alloc;
@@ -418,48 +419,6 @@ out_unlock:
}
/**
- * pcpu_split_block - split a map block
- * @chunk: chunk of interest
- * @i: index of map block to split
- * @head: head size in bytes (can be 0)
- * @tail: tail size in bytes (can be 0)
- *
- * Split the @i'th map block into two or three blocks. If @head is
- * non-zero, @head bytes block is inserted before block @i moving it
- * to @i+1 and reducing its size by @head bytes.
- *
- * If @tail is non-zero, the target block, which can be @i or @i+1
- * depending on @head, is reduced by @tail bytes and @tail byte block
- * is inserted after the target block.
- *
- * @chunk->map must have enough free slots to accommodate the split.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
- int head, int tail)
-{
- int nr_extra = !!head + !!tail;
-
- BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
-
- /* insert new subblocks */
- memmove(&chunk->map[i + nr_extra], &chunk->map[i],
- sizeof(chunk->map[0]) * (chunk->map_used - i));
- chunk->map_used += nr_extra;
-
- if (head) {
- chunk->map[i + 1] = chunk->map[i] - head;
- chunk->map[i++] = head;
- }
- if (tail) {
- chunk->map[i++] -= tail;
- chunk->map[i] = tail;
- }
-}
-
-/**
* pcpu_alloc_area - allocate area from a pcpu_chunk
* @chunk: chunk of interest
* @size: wanted size in bytes
@@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
int i, off;
+ bool seen_free = false;
+ int *p;
- for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
- bool is_last = i + 1 == chunk->map_used;
+ for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
int head, tail;
+ int this_size;
+
+ off = *p;
+ if (off & 1)
+ continue;
/* extra for alignment requirement */
head = ALIGN(off, align) - off;
- BUG_ON(i == 0 && head != 0);
- if (chunk->map[i] < 0)
- continue;
- if (chunk->map[i] < head + size) {
- max_contig = max(chunk->map[i], max_contig);
+ this_size = (p[1] & ~1) - off;
+ if (this_size < head + size) {
+ if (!seen_free) {
+ chunk->first_free = i;
+ seen_free = true;
+ }
+ max_contig = max(this_size, max_contig);
continue;
}
@@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
* than sizeof(int), which is very small but isn't too
* uncommon for percpu allocations.
*/
- if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
- if (chunk->map[i - 1] > 0)
- chunk->map[i - 1] += head;
- else {
- chunk->map[i - 1] -= head;
+ if (head && (head < sizeof(int) || !(p[-1] & 1))) {
+ *p = off += head;
+ if (p[-1] & 1)
chunk->free_size -= head;
- }
- chunk->map[i] -= head;
- off += head;
+ else
+ max_contig = max(*p - p[-1], max_contig);
+ this_size -= head;
head = 0;
}
/* if tail is small, just keep it around */
- tail = chunk->map[i] - head - size;
- if (tail < sizeof(int))
+ tail = this_size - head - size;
+ if (tail < sizeof(int)) {
tail = 0;
+ size = this_size - head;
+ }
/* split if warranted */
if (head || tail) {
- pcpu_split_block(chunk, i, head, tail);
+ int nr_extra = !!head + !!tail;
+
+ /* insert new subblocks */
+ memmove(p + nr_extra + 1, p + 1,
+ sizeof(chunk->map[0]) * (chunk->map_used - i));
+ chunk->map_used += nr_extra;
+
if (head) {
- i++;
- off += head;
- max_contig = max(chunk->map[i - 1], max_contig);
+ if (!seen_free) {
+ chunk->first_free = i;
+ seen_free = true;
+ }
+ *++p = off += head;
+ ++i;
+ max_contig = max(head, max_contig);
+ }
+ if (tail) {
+ p[1] = off + size;
+ max_contig = max(tail, max_contig);
}
- if (tail)
- max_contig = max(chunk->map[i + 1], max_contig);
}
+ if (!seen_free)
+ chunk->first_free = i + 1;
+
/* update hint and mark allocated */
- if (is_last)
+ if (i + 1 == chunk->map_used)
chunk->contig_hint = max_contig; /* fully scanned */
else
chunk->contig_hint = max(chunk->contig_hint,
max_contig);
- chunk->free_size -= chunk->map[i];
- chunk->map[i] = -chunk->map[i];
+ chunk->free_size -= size;
+ *p |= 1;
pcpu_chunk_relocate(chunk, oslot);
return off;
@@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
int oslot = pcpu_chunk_slot(chunk);
- int i, off;
-
- for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
- if (off == freeme)
- break;
+ int off = 0;
+ unsigned i, j;
+ int to_free = 0;
+ int *p;
+
+ freeme |= 1; /* we are searching for <given offset, in use> pair */
+
+ i = 0;
+ j = chunk->map_used;
+ while (i != j) {
+ unsigned k = (i + j) / 2;
+ off = chunk->map[k];
+ if (off < freeme)
+ i = k + 1;
+ else if (off > freeme)
+ j = k;
+ else
+ i = j = k;
+ }
BUG_ON(off != freeme);
- BUG_ON(chunk->map[i] > 0);
- chunk->map[i] = -chunk->map[i];
- chunk->free_size += chunk->map[i];
+ if (i < chunk->first_free)
+ chunk->first_free = i;
+ p = chunk->map + i;
+ *p = off &= ~1;
+ chunk->free_size += (p[1] & ~1) - off;
+
+ /* merge with next? */
+ if (!(p[1] & 1))
+ to_free++;
/* merge with previous? */
- if (i > 0 && chunk->map[i - 1] >= 0) {
- chunk->map[i - 1] += chunk->map[i];
- chunk->map_used--;
- memmove(&chunk->map[i], &chunk->map[i + 1],
- (chunk->map_used - i) * sizeof(chunk->map[0]));
+ if (i > 0 && !(p[-1] & 1)) {
+ to_free++;
i--;
+ p--;
}
- /* merge with next? */
- if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
- chunk->map[i] += chunk->map[i + 1];
- chunk->map_used--;
- memmove(&chunk->map[i + 1], &chunk->map[i + 2],
- (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+ if (to_free) {
+ chunk->map_used -= to_free;
+ memmove(p + 1, p + 1 + to_free,
+ (chunk->map_used - i) * sizeof(chunk->map[0]));
}
- chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+ chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
pcpu_chunk_relocate(chunk, oslot);
}
@@ -612,12 +610,14 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
return NULL;
}
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[chunk->map_used++] = pcpu_unit_size;
+ chunk->map[0] = 0;
+ chunk->map[1] = pcpu_unit_size | 1;
+ chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
@@ -713,6 +713,16 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
unsigned long flags;
void __percpu *ptr;
+ /*
+ * We want the lowest bit of offset available for in-use/free
+ * indicator, so force >= 16bit alignment and make size even.
+ */
+ if (unlikely(align < 2))
+ align = 2;
+
+ if (unlikely(size & 1))
+ size++;
+
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
@@ -1063,7 +1073,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
if (!ptr)
return NULL;
ai = ptr;
@@ -1088,7 +1098,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- free_bootmem(__pa(ai), ai->__ai_size);
+ memblock_free_early(__pa(ai), ai->__ai_size);
}
/**
@@ -1246,10 +1256,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
- group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
- unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
- unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+ group_offsets = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_offsets[0]), 0);
+ group_sizes = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_sizes[0]), 0);
+ unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
+ unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1323,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+ pcpu_slot = memblock_virt_alloc(
+ pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -1322,7 +1335,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
- schunk = alloc_bootmem(pcpu_chunk_struct_size);
+ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
schunk->base_addr = base_addr;
schunk->map = smap;
@@ -1340,13 +1353,17 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
}
schunk->contig_hint = schunk->free_size;
- schunk->map[schunk->map_used++] = -ai->static_size;
+ schunk->map[0] = 1;
+ schunk->map[1] = ai->static_size;
+ schunk->map_used = 1;
if (schunk->free_size)
- schunk->map[schunk->map_used++] = schunk->free_size;
+ schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
+ else
+ schunk->map[1] |= 1;
/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(pcpu_chunk_struct_size);
+ dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
@@ -1355,8 +1372,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
- dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
- dchunk->map[dchunk->map_used++] = dchunk->free_size;
+ dchunk->map[0] = 1;
+ dchunk->map[1] = pcpu_reserved_chunk_limit;
+ dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
+ dchunk->map_used = 2;
}
/* link the first chunk in */
@@ -1626,7 +1645,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = alloc_bootmem_nopanic(areas_size);
+ areas = memblock_virt_alloc_nopanic(areas_size, 0);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -1686,10 +1705,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
max_distance += ai->unit_size;
/* warn if maximum distance is further than 75% of vmalloc space */
- if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
+ if (max_distance > VMALLOC_TOTAL * 3 / 4) {
pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
"space 0x%lx\n", max_distance,
- (unsigned long)(VMALLOC_END - VMALLOC_START));
+ VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -1712,7 +1731,7 @@ out_free_areas:
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- free_bootmem(__pa(areas), areas_size);
+ memblock_free_early(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1779,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = alloc_bootmem(pages_size);
+ pages = memblock_virt_alloc(pages_size, 0);
/* allocate pages */
j = 0;
@@ -1823,7 +1842,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pages), pages_size);
+ memblock_free_early(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1848,12 +1867,13 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_virt_alloc_from_nopanic(
+ size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- free_bootmem(__pa(ptr), size);
+ memblock_free_early(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
@@ -1896,7 +1916,9 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ fc = memblock_virt_alloc_from_nopanic(unit_size,
+ PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb38545d9d..a8b91992593 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
+ struct mm_struct *mm = (vma)->vm_mm;
pte_t pte;
- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- if (pte_accessible(pte))
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_accessible(mm, pte))
flush_tlb_page(vma, address);
return pte;
}
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ pmd_t entry = *pmdp;
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index fd26d043350..5077afcd9e1 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -23,129 +23,40 @@
/**
* process_vm_rw_pages - read/write pages from task specified
- * @task: task to read/write from
- * @mm: mm for task
- * @process_pages: struct pages area that can store at least
- * nr_pages_to_copy struct page pointers
- * @pa: address of page in task to start copying from/to
+ * @pages: array of pointers to pages we want to copy
* @start_offset: offset in page to start copying from/to
* @len: number of bytes to copy
- * @lvec: iovec array specifying where to copy to/from
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
* @vm_write: 0 means copy from, 1 means copy to
- * @nr_pages_to_copy: number of pages to copy
- * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success, error code otherwise
*/
-static int process_vm_rw_pages(struct task_struct *task,
- struct mm_struct *mm,
- struct page **process_pages,
- unsigned long pa,
- unsigned long start_offset,
- unsigned long len,
- const struct iovec *lvec,
- unsigned long lvec_cnt,
- unsigned long *lvec_current,
- size_t *lvec_offset,
- int vm_write,
- unsigned int nr_pages_to_copy,
- ssize_t *bytes_copied)
+static int process_vm_rw_pages(struct page **pages,
+ unsigned offset,
+ size_t len,
+ struct iov_iter *iter,
+ int vm_write)
{
- int pages_pinned;
- void *target_kaddr;
- int pgs_copied = 0;
- int j;
- int ret;
- ssize_t bytes_to_copy;
- ssize_t rc = 0;
-
- *bytes_copied = 0;
-
- /* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages_pinned = get_user_pages(task, mm, pa,
- nr_pages_to_copy,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
- if (pages_pinned != nr_pages_to_copy) {
- rc = -EFAULT;
- goto end;
- }
-
/* Do the copy for each page */
- for (pgs_copied = 0;
- (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
- pgs_copied++) {
- /* Make sure we have a non zero length iovec */
- while (*lvec_current < lvec_cnt
- && lvec[*lvec_current].iov_len == 0)
- (*lvec_current)++;
- if (*lvec_current == lvec_cnt)
- break;
+ while (len && iov_iter_count(iter)) {
+ struct page *page = *pages++;
+ size_t copy = PAGE_SIZE - offset;
+ size_t copied;
- /*
- * Will copy smallest of:
- * - bytes remaining in page
- * - bytes remaining in destination iovec
- */
- bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
- len - *bytes_copied);
- bytes_to_copy = min_t(ssize_t, bytes_to_copy,
- lvec[*lvec_current].iov_len
- - *lvec_offset);
-
- target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
-
- if (vm_write)
- ret = copy_from_user(target_kaddr,
- lvec[*lvec_current].iov_base
- + *lvec_offset,
- bytes_to_copy);
- else
- ret = copy_to_user(lvec[*lvec_current].iov_base
- + *lvec_offset,
- target_kaddr, bytes_to_copy);
- kunmap(process_pages[pgs_copied]);
- if (ret) {
- *bytes_copied += bytes_to_copy - ret;
- pgs_copied++;
- rc = -EFAULT;
- goto end;
- }
- *bytes_copied += bytes_to_copy;
- *lvec_offset += bytes_to_copy;
- if (*lvec_offset == lvec[*lvec_current].iov_len) {
- /*
- * Need to copy remaining part of page into the
- * next iovec if there are any bytes left in page
- */
- (*lvec_current)++;
- *lvec_offset = 0;
- start_offset = (start_offset + bytes_to_copy)
- % PAGE_SIZE;
- if (start_offset)
- pgs_copied--;
- } else {
- start_offset = 0;
- }
- }
+ if (copy > len)
+ copy = len;
-end:
- if (vm_write) {
- for (j = 0; j < pages_pinned; j++) {
- if (j < pgs_copied)
- set_page_dirty_lock(process_pages[j]);
- put_page(process_pages[j]);
+ if (vm_write) {
+ copied = copy_page_from_iter(page, offset, copy, iter);
+ set_page_dirty_lock(page);
+ } else {
+ copied = copy_page_to_iter(page, offset, copy, iter);
}
- } else {
- for (j = 0; j < pages_pinned; j++)
- put_page(process_pages[j]);
+ len -= copied;
+ if (copied < copy && iov_iter_count(iter))
+ return -EFAULT;
+ offset = 0;
}
-
- return rc;
+ return 0;
}
/* Maximum number of pages kmalloc'd to hold struct page's during copy */
@@ -155,67 +66,60 @@ end:
* process_vm_rw_single_vec - read/write pages from task specified
* @addr: start memory address of target process
* @len: size of area to copy to/from
- * @lvec: iovec array specifying where to copy to/from locally
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
* @process_pages: struct pages area that can store at least
* nr_pages_to_copy struct page pointers
* @mm: mm for task
* @task: task to read/write from
* @vm_write: 0 means copy from, 1 means copy to
- * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success or on failure error code
*/
static int process_vm_rw_single_vec(unsigned long addr,
unsigned long len,
- const struct iovec *lvec,
- unsigned long lvec_cnt,
- unsigned long *lvec_current,
- size_t *lvec_offset,
+ struct iov_iter *iter,
struct page **process_pages,
struct mm_struct *mm,
struct task_struct *task,
- int vm_write,
- ssize_t *bytes_copied)
+ int vm_write)
{
unsigned long pa = addr & PAGE_MASK;
unsigned long start_offset = addr - pa;
unsigned long nr_pages;
- ssize_t bytes_copied_loop;
ssize_t rc = 0;
- unsigned long nr_pages_copied = 0;
- unsigned long nr_pages_to_copy;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- *bytes_copied = 0;
-
/* Work out address and page range required */
if (len == 0)
return 0;
nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
- while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
- nr_pages_to_copy = min(nr_pages - nr_pages_copied,
- max_pages_per_loop);
+ while (!rc && nr_pages && iov_iter_count(iter)) {
+ int pages = min(nr_pages, max_pages_per_loop);
+ size_t bytes;
- rc = process_vm_rw_pages(task, mm, process_pages, pa,
- start_offset, len,
- lvec, lvec_cnt,
- lvec_current, lvec_offset,
- vm_write, nr_pages_to_copy,
- &bytes_copied_loop);
- start_offset = 0;
- *bytes_copied += bytes_copied_loop;
+ /* Get the pages we're interested in */
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages(task, mm, pa, pages,
+ vm_write, 0, process_pages, NULL);
+ up_read(&mm->mmap_sem);
- if (rc < 0) {
- return rc;
- } else {
- len -= bytes_copied_loop;
- nr_pages_copied += nr_pages_to_copy;
- pa += nr_pages_to_copy * PAGE_SIZE;
- }
+ if (pages <= 0)
+ return -EFAULT;
+
+ bytes = pages * PAGE_SIZE - start_offset;
+ if (bytes > len)
+ bytes = len;
+
+ rc = process_vm_rw_pages(process_pages,
+ start_offset, bytes, iter,
+ vm_write);
+ len -= bytes;
+ start_offset = 0;
+ nr_pages -= pages;
+ pa += pages * PAGE_SIZE;
+ while (pages)
+ put_page(process_pages[--pages]);
}
return rc;
@@ -228,8 +132,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
/**
* process_vm_rw_core - core of reading/writing pages from task specified
* @pid: PID of process to read/write from/to
- * @lvec: iovec array specifying where to copy to/from locally
- * @liovcnt: size of lvec array
+ * @iter: where to copy to/from locally
* @rvec: iovec array specifying where to copy to/from in the other process
* @riovcnt: size of rvec array
* @flags: currently unused
@@ -238,8 +141,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* return less bytes than expected if an error occurs during the copying
* process.
*/
-static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
- unsigned long liovcnt,
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
const struct iovec *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
@@ -250,13 +152,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
struct mm_struct *mm;
unsigned long i;
ssize_t rc = 0;
- ssize_t bytes_copied_loop;
- ssize_t bytes_copied = 0;
unsigned long nr_pages = 0;
unsigned long nr_pages_iov;
- unsigned long iov_l_curr_idx = 0;
- size_t iov_l_curr_offset = 0;
ssize_t iov_len;
+ size_t total_len = iov_iter_count(iter);
/*
* Work out how many pages of struct pages we're going to need
@@ -310,24 +209,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
goto put_task_struct;
}
- for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
+ for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
- lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
- process_pages, mm, task, vm_write, &bytes_copied_loop);
- bytes_copied += bytes_copied_loop;
- if (rc != 0) {
- /* If we have managed to copy any data at all then
- we return the number of bytes copied. Otherwise
- we return the error code */
- if (bytes_copied)
- rc = bytes_copied;
- goto put_mm;
- }
- }
+ iter, process_pages, mm, task, vm_write);
+
+ /* copied = space before - space after */
+ total_len -= iov_iter_count(iter);
+
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (total_len)
+ rc = total_len;
- rc = bytes_copied;
-put_mm:
mmput(mm);
put_task_struct:
@@ -363,6 +258,7 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
ssize_t rc;
if (flags != 0)
@@ -378,13 +274,14 @@ static ssize_t process_vm_rw(pid_t pid,
if (rc <= 0)
goto free_iovecs;
+ iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
+
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
- vm_write);
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
free_iovecs:
if (iov_r != iovstack_r)
@@ -412,7 +309,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
#ifdef CONFIG_COMPAT
-asmlinkage ssize_t
+static ssize_t
compat_process_vm_rw(compat_pid_t pid,
const struct compat_iovec __user *lvec,
unsigned long liovcnt,
@@ -424,6 +321,7 @@ compat_process_vm_rw(compat_pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
ssize_t rc = -EFAULT;
if (flags != 0)
@@ -439,14 +337,14 @@ compat_process_vm_rw(compat_pid_t pid,
&iov_l);
if (rc <= 0)
goto free_iovecs;
+ iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
&iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
- vm_write);
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
free_iovecs:
if (iov_r != iovstack_r)
@@ -456,25 +354,23 @@ free_iovecs:
return rc;
}
-asmlinkage ssize_t
-compat_sys_process_vm_readv(compat_pid_t pid,
- const struct compat_iovec __user *lvec,
- unsigned long liovcnt,
- const struct compat_iovec __user *rvec,
- unsigned long riovcnt,
- unsigned long flags)
+COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
+ const struct compat_iovec __user *, lvec,
+ compat_ulong_t, liovcnt,
+ const struct compat_iovec __user *, rvec,
+ compat_ulong_t, riovcnt,
+ compat_ulong_t, flags)
{
return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
riovcnt, flags, 0);
}
-asmlinkage ssize_t
-compat_sys_process_vm_writev(compat_pid_t pid,
- const struct compat_iovec __user *lvec,
- unsigned long liovcnt,
- const struct compat_iovec __user *rvec,
- unsigned long riovcnt,
- unsigned long flags)
+COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
+ const struct compat_iovec __user *, lvec,
+ compat_ulong_t, liovcnt,
+ const struct compat_iovec __user *, rvec,
+ compat_ulong_t, riovcnt,
+ compat_ulong_t, flags)
{
return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
riovcnt, flags, 1);
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44aa90..0ca36a7770b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
#include <linux/syscalls.h>
#include <linux/file.h>
+#include "internal.h"
+
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -149,8 +149,7 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
- if (page)
+ if (page && !radix_tree_exceptional_entry(page))
continue;
page = page_cache_alloc_readahead(mapping);
@@ -211,8 +210,6 @@ out:
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read)
{
- int ret = 0;
-
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
return -EINVAL;
@@ -226,39 +223,23 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
this_chunk = nr_to_read;
err = __do_page_cache_readahead(mapping, filp,
offset, this_chunk, 0);
- if (err < 0) {
- ret = err;
- break;
- }
- ret += err;
+ if (err < 0)
+ return err;
+
offset += this_chunk;
nr_to_read -= this_chunk;
}
- return ret;
+ return 0;
}
+#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
/*
* Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
* sensible upper limit.
*/
unsigned long max_sane_readahead(unsigned long nr)
{
- return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
- + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
-}
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- int actual;
-
- actual = __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-
- return actual;
+ return min(nr, MAX_READAHEAD);
}
/*
@@ -351,7 +332,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
pgoff_t head;
rcu_read_lock();
- head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
+ head = page_cache_prev_hole(mapping, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -431,7 +412,7 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+ start = page_cache_next_hole(mapping, offset + 1, max);
rcu_read_unlock();
if (!start || start - offset > max)
@@ -576,8 +557,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
if (!mapping || !mapping->a_ops)
return -EINVAL;
- force_page_cache_readahead(mapping, filp, index, nr);
- return 0;
+ return force_page_cache_readahead(mapping, filp, index, nr);
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8dc9ff..22a4a7699cd 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
+ might_sleep();
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
anon_vma_lock_write(anon_vma);
anon_vma_unlock_write(anon_vma);
@@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page)
* above cannot corrupt).
*/
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
+ return NULL;
}
out:
rcu_read_unlock();
@@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
}
if (!page_mapped(page)) {
+ rcu_read_unlock();
put_anon_vma(anon_vma);
- anon_vma = NULL;
- goto out;
+ return NULL;
}
/* we pinned the anon_vma, its safe to sleep */
@@ -515,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
static inline unsigned long
__vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
- if (unlikely(is_vm_hugetlb_page(vma)))
- pgoff = page->index << huge_page_order(page_hstate(page));
-
+ pgoff_t pgoff = page_to_pgoff(page);
return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}
@@ -567,6 +565,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
+ pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -577,7 +576,13 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ /*
+ * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = ACCESS_ONCE(*pmd);
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
pmd = NULL;
out:
return pmd;
@@ -600,7 +605,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
+
ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
goto check;
}
@@ -609,9 +618,6 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
if (!pmd)
return NULL;
- if (pmd_trans_huge(*pmd))
- return NULL;
-
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
if (!sync && !pte_present(*pte)) {
@@ -656,17 +662,22 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+struct page_referenced_arg {
+ int mapcount;
+ int referenced;
+ unsigned long vm_flags;
+ struct mem_cgroup *memcg;
+};
/*
- * Subfunctions of page_referenced: page_referenced_one called
- * repeatedly from either page_referenced_anon or page_referenced_file.
+ * arg: page_referenced_arg will be passed
*/
-int page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, unsigned int *mapcount,
- unsigned long *vm_flags)
+static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int referenced = 0;
+ struct page_referenced_arg *pra = arg;
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
@@ -678,13 +689,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
if (!pmd)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
spin_unlock(ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
/* go ahead even if the pmd is pmd_trans_splitting() */
@@ -700,13 +710,12 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
*/
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
- goto out;
+ return SWAP_AGAIN;
if (vma->vm_flags & VM_LOCKED) {
pte_unmap_unlock(pte, ptl);
- *mapcount = 0; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out;
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
}
if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -723,113 +732,27 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- (*mapcount)--;
-
- if (referenced)
- *vm_flags |= vma->vm_flags;
-out:
- return referenced;
-}
-
-static int page_referenced_anon(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
-{
- unsigned int mapcount;
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int referenced = 0;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return referenced;
-
- mapcount = page_mapcount(page);
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
+ if (referenced) {
+ pra->referenced++;
+ pra->vm_flags |= vma->vm_flags;
}
- page_unlock_anon_vma_read(anon_vma);
- return referenced;
+ pra->mapcount--;
+ if (!pra->mapcount)
+ return SWAP_SUCCESS; /* To break the loop */
+
+ return SWAP_AGAIN;
}
-/**
- * page_referenced_file - referenced check for object-based rmap
- * @page: the page we're checking references on.
- * @memcg: target memory control group
- * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
- *
- * For an object-based mapped page, find all the places it is mapped and
- * check/clear the referenced flag. This is done by following the page->mapping
- * pointer, then walking the chain of vmas it holds. It returns the number
- * of references it found.
- *
- * This function is only called from page_referenced for object-based pages.
- */
-static int page_referenced_file(struct page *page,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
{
- unsigned int mapcount;
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int referenced = 0;
-
- /*
- * The caller's checks on page->mapping and !PageAnon have made
- * sure that this is a file page: the check for page->mapping
- * excludes the case just before it gets set on an anon page.
- */
- BUG_ON(PageAnon(page));
-
- /*
- * The page lock not only makes sure that page->mapping cannot
- * suddenly be NULLified by truncation, it makes sure that the
- * structure at mapping cannot be freed and reused yet,
- * so we can safely take mapping->i_mmap_mutex.
- */
- BUG_ON(!PageLocked(page));
-
- mutex_lock(&mapping->i_mmap_mutex);
-
- /*
- * i_mmap_mutex does not stabilize mapcount at all, but mapcount
- * is more likely to be accurate if we note it after spinning.
- */
- mapcount = page_mapcount(page);
+ struct page_referenced_arg *pra = arg;
+ struct mem_cgroup *memcg = pra->memcg;
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
- continue;
- referenced += page_referenced_one(page, vma, address,
- &mapcount, vm_flags);
- if (!mapcount)
- break;
- }
+ if (!mm_match_cgroup(vma->vm_mm, memcg))
+ return true;
- mutex_unlock(&mapping->i_mmap_mutex);
- return referenced;
+ return false;
}
/**
@@ -847,41 +770,57 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int referenced = 0;
+ int ret;
int we_locked = 0;
+ struct page_referenced_arg pra = {
+ .mapcount = page_mapcount(page),
+ .memcg = memcg,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_referenced_one,
+ .arg = (void *)&pra,
+ .anon_lock = page_lock_anon_vma_read,
+ };
*vm_flags = 0;
- if (page_mapped(page) && page_rmapping(page)) {
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
- if (!we_locked) {
- referenced++;
- goto out;
- }
- }
- if (unlikely(PageKsm(page)))
- referenced += page_referenced_ksm(page, memcg,
- vm_flags);
- else if (PageAnon(page))
- referenced += page_referenced_anon(page, memcg,
- vm_flags);
- else if (page->mapping)
- referenced += page_referenced_file(page, memcg,
- vm_flags);
- if (we_locked)
- unlock_page(page);
+ if (!page_mapped(page))
+ return 0;
+
+ if (!page_rmapping(page))
+ return 0;
+
+ if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+ we_locked = trylock_page(page);
+ if (!we_locked)
+ return 1;
}
-out:
- return referenced;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip
+ * counting on behalf of references from different
+ * cgroups
+ */
+ if (memcg) {
+ rwc.invalid_vma = invalid_page_referenced_vma;
+ }
+
+ ret = rmap_walk(page, &rwc);
+ *vm_flags = pra.vm_flags;
+
+ if (we_locked)
+ unlock_page(page);
+
+ return pra.referenced;
}
static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
spinlock_t *ptl;
int ret = 0;
+ int *cleaned = arg;
pte = page_check_address(page, mm, address, &ptl, 1);
if (!pte)
@@ -900,44 +839,44 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
- if (ret)
+ if (ret) {
mmu_notifier_invalidate_page(mm, address);
+ (*cleaned)++;
+ }
out:
- return ret;
+ return SWAP_AGAIN;
}
-static int page_mkclean_file(struct address_space *mapping, struct page *page)
+static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct vm_area_struct *vma;
- int ret = 0;
-
- BUG_ON(PageAnon(page));
+ if (vma->vm_flags & VM_SHARED)
+ return false;
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- if (vma->vm_flags & VM_SHARED) {
- unsigned long address = vma_address(page, vma);
- ret += page_mkclean_one(page, vma, address);
- }
- }
- mutex_unlock(&mapping->i_mmap_mutex);
- return ret;
+ return true;
}
int page_mkclean(struct page *page)
{
- int ret = 0;
+ int cleaned = 0;
+ struct address_space *mapping;
+ struct rmap_walk_control rwc = {
+ .arg = (void *)&cleaned,
+ .rmap_one = page_mkclean_one,
+ .invalid_vma = invalid_mkclean_vma,
+ };
BUG_ON(!PageLocked(page));
- if (page_mapped(page)) {
- struct address_space *mapping = page_mapping(page);
- if (mapping)
- ret = page_mkclean_file(mapping, page);
- }
+ if (!page_mapped(page))
+ return 0;
- return ret;
+ mapping = page_mapping(page);
+ if (!mapping)
+ return 0;
+
+ rmap_walk(page, &rwc);
+
+ return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
@@ -957,9 +896,9 @@ void page_move_anon_rmap(struct page *page,
{
struct anon_vma *anon_vma = vma->anon_vma;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON(!anon_vma);
- VM_BUG_ON(page->index != linear_page_index(vma, address));
+ VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
@@ -1049,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page,
{
int first = atomic_inc_and_test(&page->_mapcount);
if (first) {
+ /*
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption
+ * disabled.
+ */
if (PageTransHuge(page))
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
@@ -1058,7 +1003,7 @@ void do_page_add_anon_rmap(struct page *page,
if (unlikely(PageKsm(page)))
return;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
/* address might be in next vma when migration races vma_adjust */
if (first)
__page_set_anon_rmap(page, vma, address, exclusive);
@@ -1087,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page,
__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
hpage_nr_pages(page));
__page_set_anon_rmap(page, vma, address, 1);
- if (!mlocked_vma_newpage(vma, page)) {
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
SetPageActive(page);
lru_cache_add(page);
- } else
- add_page_to_unevictable_list(page);
+ return;
+ }
+
+ if (!TestSetPageMlocked(page)) {
+ /*
+ * We use the irq-unsafe __mod_zone_page_stat because this
+ * counter is not modified from interrupt context, and the pte
+ * lock is held(spinlock), which implies preemption disabled.
+ */
+ __mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
+ count_vm_event(UNEVICTABLE_PGMLOCKED);
+ }
+ add_page_to_unevictable_list(page);
}
/**
@@ -1140,6 +1099,11 @@ void page_remove_rmap(struct page *page)
/*
* Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
* and not charged by memcg for now.
+ *
+ * We use the irq-unsafe __{inc|mod}_zone_page_stat because
+ * these counters are not modified in interrupt context, and
+ * these counters are not modified in interrupt context, and
+ * pte lock(a spinlock) is held, which implies preemption disabled.
*/
if (unlikely(PageHuge(page)))
goto out;
@@ -1173,17 +1137,17 @@ out:
}
/*
- * Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
+ * @arg: enum ttu_flags will be passed to this argument
*/
-int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, enum ttu_flags flags)
+static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
+ enum ttu_flags flags = (enum ttu_flags)arg;
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
@@ -1198,7 +1162,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED)
goto out_mlock;
- if (TTU_ACTION(flags) == TTU_MUNLOCK)
+ if (flags & TTU_MUNLOCK)
goto out_unmap;
}
if (!(flags & TTU_IGNORE_ACCESS)) {
@@ -1228,6 +1192,16 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
+ } else if (pte_unused(pteval)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ */
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
pte_t swp_pte;
@@ -1256,7 +1230,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
+ BUG_ON(!(flags & TTU_MIGRATION));
entry = make_migration_entry(page, pte_write(pteval));
}
swp_pte = swp_entry_to_pte(entry);
@@ -1265,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
set_pte_at(mm, address, pte, swp_pte);
BUG_ON(pte_file(*pte));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (TTU_ACTION(flags) == TTU_MIGRATION)) {
+ (flags & TTU_MIGRATION)) {
/* Establish migration entry for a file page */
swp_entry_t entry;
entry = make_migration_entry(page, pte_write(pteval));
@@ -1278,7 +1252,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret != SWAP_FAIL)
+ if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
mmu_notifier_invalidate_page(mm, address);
out:
return ret;
@@ -1385,9 +1359,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
@@ -1402,7 +1386,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (page->index != linear_page_index(vma, address)) {
pte_t ptfile = pgoff_to_pte(page->index);
if (pte_soft_dirty(pteval))
- pte_file_mksoft_dirty(ptfile);
+ ptfile = pte_file_mksoft_dirty(ptfile);
set_pte_at(mm, address, pte, ptfile);
}
@@ -1422,93 +1406,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
-/**
- * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
- * rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the anon_vma struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * anonymous pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
-{
- struct anon_vma *anon_vma;
- pgoff_t pgoff;
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
-
- anon_vma = page_lock_anon_vma_read(page);
- if (!anon_vma)
- return ret;
-
- pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long address;
-
- /*
- * During exec, a temporary VMA is setup and later moved.
- * The VMA is moved under the anon_vma lock but not the
- * page tables leading to a race where migration cannot
- * find the migration ptes. Rather than increasing the
- * locking requirements of exec(), migration skips
- * temporary VMAs until after exec() completes.
- */
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_vma_temporary_stack(vma))
- continue;
-
- address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- break;
- }
-
- page_unlock_anon_vma_read(anon_vma);
- return ret;
-}
-
-/**
- * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
- * @page: the page to unmap/unlock
- * @flags: action and flags
- *
- * Find all the mappings of a page using the mapping pointer and the vma chains
- * contained in the address_space struct it points to.
- *
- * This function is only called from try_to_unmap/try_to_munlock for
- * object-based pages.
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * 'LOCKED.
- */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_nonlinear(struct page *page,
+ struct address_space *mapping, void *arg)
{
- struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
unsigned long cursor;
@@ -1516,30 +1416,9 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned long max_nl_size = 0;
unsigned int mapcount;
- if (PageHuge(page))
- pgoff = page->index << compound_order(page);
-
- mutex_lock(&mapping->i_mmap_mutex);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- unsigned long address = vma_address(page, vma);
- ret = try_to_unmap_one(page, vma, address, flags);
- if (ret != SWAP_AGAIN || !page_mapped(page))
- goto out;
- }
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto out;
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
- /*
- * We don't bother to try to find the munlocked page in nonlinears.
- * It's costly. Instead, later, page reclaim logic may call
- * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
- */
- if (TTU_ACTION(flags) == TTU_MUNLOCK)
- goto out;
-
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1549,8 +1428,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
}
if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- ret = SWAP_FAIL;
- goto out;
+ return SWAP_FAIL;
}
/*
@@ -1562,7 +1440,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
mapcount = page_mapcount(page);
if (!mapcount)
- goto out;
+ return ret;
+
cond_resched();
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -1570,10 +1449,11 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
max_nl_cursor = CLUSTER_SIZE;
do {
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.nonlinear) {
+ list_for_each_entry(vma,
+ &mapping->i_mmap_nonlinear, shared.nonlinear) {
+
cursor = (unsigned long) vma->vm_private_data;
- while ( cursor < max_nl_cursor &&
+ while (cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
if (try_to_unmap_cluster(cursor, &mapcount,
vma, page) == SWAP_MLOCK)
@@ -1581,7 +1461,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
- goto out;
+ return ret;
}
vma->vm_private_data = (void *) max_nl_cursor;
}
@@ -1596,11 +1476,34 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
*/
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
-out:
- mutex_unlock(&mapping->i_mmap_mutex);
+
return ret;
}
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
+{
+ return is_vma_temporary_stack(vma);
+}
+
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_unmap - try to remove all page table mappings to a page
* @page: the page to get unmapped
@@ -1618,16 +1521,29 @@ out:
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .file_nonlinear = try_to_unmap_nonlinear,
+ .anon_lock = page_lock_anon_vma_read,
+ };
- BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
+ VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
+
+ /*
+ * During exec, a temporary VMA is setup and later moved.
+ * The VMA is moved under the anon_vma lock but not the
+ * page tables leading to a race where migration cannot
+ * find the migration ptes. Rather than increasing the
+ * locking requirements of exec(), migration skips
+ * temporary VMAs until after exec() completes.
+ */
+ if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
+ rwc.invalid_vma = invalid_migration_vma;
+
+ ret = rmap_walk(page, &rwc);
- if (unlikely(PageKsm(page)))
- ret = try_to_unmap_ksm(page, flags);
- else if (PageAnon(page))
- ret = try_to_unmap_anon(page, flags);
- else
- ret = try_to_unmap_file(page, flags);
if (ret != SWAP_MLOCK && !page_mapped(page))
ret = SWAP_SUCCESS;
return ret;
@@ -1650,38 +1566,43 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
*/
int try_to_munlock(struct page *page)
{
- VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+ int ret;
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_unmap_one,
+ .arg = (void *)TTU_MUNLOCK,
+ .done = page_not_mapped,
+ /*
+ * We don't bother to try to find the munlocked page in
+ * nonlinears. It's costly. Instead, later, page reclaim logic
+ * may call try_to_unmap() and recover PG_mlocked lazily.
+ */
+ .file_nonlinear = NULL,
+ .anon_lock = page_lock_anon_vma_read,
- if (unlikely(PageKsm(page)))
- return try_to_unmap_ksm(page, TTU_MUNLOCK);
- else if (PageAnon(page))
- return try_to_unmap_anon(page, TTU_MUNLOCK);
- else
- return try_to_unmap_file(page, TTU_MUNLOCK);
+ };
+
+ VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+
+ ret = rmap_walk(page, &rwc);
+ return ret;
}
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
+ anon_vma_free(anon_vma);
if (root != anon_vma && atomic_dec_and_test(&root->refcount))
anon_vma_free(root);
-
- anon_vma_free(anon_vma);
}
-#ifdef CONFIG_MIGRATION
-/*
- * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
- * Called by migrate.c to remove migration ptes, but might be used more later.
- */
-static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
+
+ if (rwc->anon_lock)
+ return rwc->anon_lock(page);
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1691,58 +1612,120 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
*/
anon_vma = page_anon_vma(page);
if (!anon_vma)
- return ret;
+ return NULL;
+
anon_vma_lock_read(anon_vma);
+ return anon_vma;
+}
+
+/*
+ * rmap_walk_anon - do something to anonymous page using the object-based
+ * rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+ struct anon_vma *anon_vma;
+ pgoff_t pgoff = page_to_pgoff(page);
+ struct anon_vma_chain *avc;
+ int ret = SWAP_AGAIN;
+
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (!anon_vma)
+ return ret;
+
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
break;
+ if (rwc->done && rwc->done(page))
+ break;
}
anon_vma_unlock_read(anon_vma);
return ret;
}
-static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+/*
+ * rmap_walk_file - do something to file page using the object-based rmap method
+ * @page: the page to be handled
+ * @rwc: control variable according to each walk type
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the address_space struct it points to.
+ *
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write. So, we won't recheck
+ * vm_flags for that VMA. That should be OK, because that vma shouldn't be
+ * LOCKED.
+ */
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
{
struct address_space *mapping = page->mapping;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ pgoff_t pgoff = page_to_pgoff(page);
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
+ /*
+ * The page lock not only makes sure that page->mapping cannot
+ * suddenly be NULLified by truncation, it makes sure that the
+ * structure at mapping cannot be freed and reused yet,
+ * so we can safely take mapping->i_mmap_mutex.
+ */
+ VM_BUG_ON(!PageLocked(page));
+
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- ret = rmap_one(page, vma, address, arg);
+
+ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+ continue;
+
+ ret = rwc->rmap_one(page, vma, address, rwc->arg);
if (ret != SWAP_AGAIN)
- break;
+ goto done;
+ if (rwc->done && rwc->done(page))
+ goto done;
}
- /*
- * No nonlinear handling: being always shared, nonlinear vmas
- * never contain migration ptes. Decide what to do about this
- * limitation to linear when we need rmap_walk() on nonlinear.
- */
+
+ if (!rwc->file_nonlinear)
+ goto done;
+
+ if (list_empty(&mapping->i_mmap_nonlinear))
+ goto done;
+
+ ret = rwc->file_nonlinear(page, mapping, rwc->arg);
+
+done:
mutex_unlock(&mapping->i_mmap_mutex);
return ret;
}
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
- struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
- VM_BUG_ON(!PageLocked(page));
-
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rmap_one, arg);
+ return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rmap_one, arg);
+ return rmap_walk_anon(page, rwc);
else
- return rmap_walk_file(page, rmap_one, arg);
+ return rmap_walk_file(page, rwc);
}
-#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_HUGETLB_PAGE
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 8297623fcae..af68b15a8fc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,7 +45,7 @@ static struct vfsmount *shm_mnt;
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
-#include <linux/generic_acl.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
#define SHORT_SYMLINK_LEN 128
/*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
+ wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
pgoff_t start; /* start of range currently being fallocated */
pgoff_t next; /* the next page offset to be fallocated */
pgoff_t nr_falloced; /* how many new pages have been fallocated */
@@ -242,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
void **pslot;
- void *item = NULL;
+ void *item;
VM_BUG_ON(!expected);
+ VM_BUG_ON(!replacement);
pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
- if (pslot)
- item = radix_tree_deref_slot_protected(pslot,
- &mapping->tree_lock);
+ if (!pslot)
+ return -ENOENT;
+ item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
if (item != expected)
return -ENOENT;
- if (replacement)
- radix_tree_replace_slot(pslot, replacement);
- else
- radix_tree_delete(&mapping->page_tree, index);
+ radix_tree_replace_slot(pslot, replacement);
return 0;
}
@@ -285,8 +284,8 @@ static int shmem_add_to_page_cache(struct page *page,
{
int error;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
page->mapping = mapping;
@@ -331,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
}
/*
- * Like find_get_pages, but collecting swap entries as well as pages.
- */
-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
- pgoff_t start, unsigned int nr_pages,
- struct page **pages, pgoff_t *indices)
-{
- void **slot;
- unsigned int ret = 0;
- struct radix_tree_iter iter;
-
- if (!nr_pages)
- return 0;
-
- rcu_read_lock();
-restart:
- radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
- /*
- * Otherwise, we must be storing a swap entry
- * here as an exceptional entry: so return it
- * without attempting to raise page count.
- */
- goto export;
- }
- if (!page_cache_get_speculative(page))
- goto repeat;
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- page_cache_release(page);
- goto repeat;
- }
-export:
- indices[ret] = iter.index;
- pages[ret] = page;
- if (++ret == nr_pages)
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Remove swap entry from radix tree, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
- int error;
+ void *old;
spin_lock_irq(&mapping->tree_lock);
- error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+ old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
spin_unlock_irq(&mapping->tree_lock);
- if (!error)
- free_swap_and_cache(radix_to_swp_entry(radswap));
- return error;
-}
-
-/*
- * Pagevec may contain swap entries, so shuffle up pages before releasing.
- */
-static void shmem_deswap_pagevec(struct pagevec *pvec)
-{
- int i, j;
-
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
- pvec->pages[j++] = page;
- }
- pvec->nr = j;
+ if (old != radswap)
+ return -ENOENT;
+ free_swap_and_cache(radix_to_swp_entry(radswap));
+ return 0;
}
/*
@@ -429,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
* Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
* has finished, if it hits a row of PAGEVEC_SIZE swap entries.
*/
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ PAGEVEC_SIZE, pvec.pages, indices);
if (!pvec.nr)
break;
index = indices[pvec.nr - 1] + 1;
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
check_move_unevictable_pages(pvec.pages, pvec.nr);
pagevec_release(&pvec);
cond_resched();
@@ -466,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pagevec_init(&pvec, 0);
index = start;
while (index < end) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.nr = find_get_entries(mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ pvec.pages, indices);
if (!pvec.nr)
break;
mem_cgroup_uncharge_start();
@@ -491,13 +426,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
}
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
cond_resched();
@@ -533,22 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
return;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+
+ pvec.nr = find_get_entries(mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
+ pvec.pages, indices);
if (!pvec.nr) {
- if (index == start || unfalloc)
+ /* If all gone or hole-punch or unfalloc, we're done */
+ if (index == start || end != -1)
break;
+ /* But if truncating, restart to make sure all gone */
index = start;
continue;
}
- if ((index == start || unfalloc) && indices[0] >= end) {
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- break;
- }
mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -560,21 +493,30 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (radix_tree_exceptional_entry(page)) {
if (unfalloc)
continue;
- nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ if (shmem_free_swap(mapping, index, page)) {
+ /* Swap was replaced by page: retry */
+ index--;
+ break;
+ }
+ nr_swaps_freed++;
continue;
}
lock_page(page);
if (!unfalloc || !PageUptodate(page)) {
if (page->mapping == mapping) {
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
truncate_inode_page(mapping, page);
+ } else {
+ /* Page was replaced by swap: retry */
+ unlock_page(page);
+ index--;
+ break;
}
}
unlock_page(page);
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
mem_cgroup_uncharge_end();
index++;
@@ -620,10 +562,8 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
setattr_copy(inode, attr);
-#ifdef CONFIG_TMPFS_POSIX_ACL
if (attr->ia_valid & ATTR_MODE)
- error = generic_acl_chmod(inode);
-#endif
+ error = posix_acl_chmod(inode, inode->i_mode);
return error;
}
@@ -750,7 +690,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -826,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
shmem_falloc = inode->i_private;
if (shmem_falloc &&
+ !shmem_falloc->waitq &&
index >= shmem_falloc->start &&
index < shmem_falloc->next)
shmem_falloc->nr_unswapped++;
@@ -1082,7 +1023,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
return -EFBIG;
repeat:
swap.val = 0;
- page = find_lock_page(mapping, index);
+ page = find_lock_entry(mapping, index);
if (radix_tree_exceptional_entry(page)) {
swap = radix_to_swp_entry(page);
page = NULL;
@@ -1094,6 +1035,9 @@ repeat:
goto failed;
}
+ if (page && sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
/* fallocated page? */
if (page && !PageUptodate(page)) {
if (sgp != SGP_READ)
@@ -1147,7 +1091,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1175,6 +1119,9 @@ repeat:
shmem_recalc_inode(inode);
spin_unlock(&info->lock);
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
delete_from_swap_cache(page);
set_page_dirty(page);
swap_free(swap);
@@ -1199,9 +1146,12 @@ repeat:
goto decused;
}
- SetPageSwapBacked(page);
+ __SetPageSwapBacked(page);
__set_page_locked(page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ if (sgp == SGP_WRITE)
+ init_page_accessed(page);
+
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
goto decused;
@@ -1300,6 +1250,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int error;
int ret = VM_FAULT_LOCKED;
+ /*
+ * Trinity finds that probing a hole which tmpfs is punching can
+ * prevent the hole-punch from ever completing: which in turn
+ * locks writers out with its hold on i_mutex. So refrain from
+ * faulting pages into the hole while it's being punched. Although
+ * shmem_undo_range() does remove the additions, it may be unable to
+ * keep up, as each new page needs its own unmap_mapping_range() call,
+ * and the i_mmap tree grows ever slower to scan if new vmas are added.
+ *
+ * It does not matter if we sometimes reach this check just before the
+ * hole-punch begins, so that one fault then races with the punch:
+ * we just need to make racing faults a rare case.
+ *
+ * The implementation below would be much simpler if we just used a
+ * standard mutex or completion: but we cannot take i_mutex in fault,
+ * and bloating every shmem inode for this unlikely case would be sad.
+ */
+ if (unlikely(inode->i_private)) {
+ struct shmem_falloc *shmem_falloc;
+
+ spin_lock(&inode->i_lock);
+ shmem_falloc = inode->i_private;
+ if (shmem_falloc &&
+ shmem_falloc->waitq &&
+ vmf->pgoff >= shmem_falloc->start &&
+ vmf->pgoff < shmem_falloc->next) {
+ wait_queue_head_t *shmem_falloc_waitq;
+ DEFINE_WAIT(shmem_fault_wait);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+ !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /* It's polite to up mmap_sem if we can */
+ up_read(&vma->vm_mm->mmap_sem);
+ ret = VM_FAULT_RETRY;
+ }
+
+ shmem_falloc_waitq = shmem_falloc->waitq;
+ prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+ TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode->i_lock);
+ schedule();
+
+ /*
+ * shmem_falloc_waitq points into the shmem_fallocate()
+ * stack of the hole-punching task: shmem_falloc_waitq
+ * is usually invalid by the time we reach here, but
+ * finish_wait() does not dereference it in that case;
+ * though i_lock needed lest racing with wake_up_all().
+ */
+ spin_lock(&inode->i_lock);
+ finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+ spin_unlock(&inode->i_lock);
+ return ret;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+
error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
if (error)
return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1419,6 +1427,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->backing_dev_info == &shmem_backing_dev_info;
+}
+
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
@@ -1464,13 +1477,17 @@ shmem_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct inode *inode = file_inode(filp);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
enum sgp_type sgp = SGP_READ;
+ int error = 0;
+ ssize_t retval = 0;
+ loff_t *ppos = &iocb->ki_pos;
/*
* Might this read be for a stacking filesystem? Then when reading
@@ -1498,10 +1515,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
break;
}
- desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
- if (desc->error) {
- if (desc->error == -EINVAL)
- desc->error = 0;
+ error = shmem_getpage(inode, index, &page, sgp, NULL);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
break;
}
if (page)
@@ -1545,61 +1562,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
+ ret = copy_page_to_iter(page, offset, nr, to);
+ retval += ret;
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
- if (ret != nr || !desc->count)
+ if (!iov_iter_count(to))
break;
-
+ if (ret < nr) {
+ error = -EFAULT;
+ break;
+ }
cond_resched();
}
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- file_accessed(filp);
-}
-
-static ssize_t shmem_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
- struct file *filp = iocb->ki_filp;
- ssize_t retval;
- unsigned long seg;
- size_t count;
- loff_t *ppos = &iocb->ki_pos;
-
- retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
- if (retval)
- return retval;
-
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
-
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_shmem_file_read(filp, ppos, &desc, file_read_actor);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
- }
- return retval;
+ file_accessed(file);
+ return retval ? retval : error;
}
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
@@ -1638,7 +1620,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, pipe->buffers);
+ nr_pages = min(req_pages, spd.nr_pages_max);
spd.nr_pages = find_get_pages_contig(mapping, index,
nr_pages, spd.pages);
@@ -1731,7 +1713,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pagevec_init(&pvec, 0);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr = find_get_entries(mapping, index,
pvec.nr, pvec.pages, indices);
if (!pvec.nr) {
if (whence == SEEK_DATA)
@@ -1758,7 +1740,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
break;
}
}
- shmem_deswap_pagevec(&pvec);
+ pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
pvec.nr = PAGEVEC_SIZE;
cond_resched();
@@ -1813,18 +1795,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
pgoff_t start, index, end;
int error;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
mutex_lock(&inode->i_mutex);
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
loff_t unmap_start = round_up(offset, PAGE_SIZE);
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+ shmem_falloc.waitq = &shmem_falloc_waitq;
+ shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+ spin_lock(&inode->i_lock);
+ inode->i_private = &shmem_falloc;
+ spin_unlock(&inode->i_lock);
if ((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1 + unmap_end - unmap_start, 0);
shmem_truncate_range(inode, offset, offset + len - 1);
/* No need to unmap again: hole-punching leaves COWed pages */
+
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ wake_up_all(&shmem_falloc_waitq);
+ spin_unlock(&inode->i_lock);
error = 0;
goto out;
}
@@ -1842,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ shmem_falloc.waitq = NULL;
shmem_falloc.start = start;
shmem_falloc.next = start;
shmem_falloc.nr_falloced = 0;
@@ -1937,22 +1936,14 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#endif
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
error = security_inode_init_security(inode, dir,
&dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
@@ -1961,6 +1952,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
dget(dentry); /* Extra count - pin the dentry in core */
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int
@@ -1974,24 +1968,17 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
error = security_inode_init_security(inode, dir,
NULL,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- }
-#ifdef CONFIG_TMPFS_POSIX_ACL
- error = generic_acl_init(inode, dir);
- if (error) {
- iput(inode);
- return error;
- }
-#else
- error = 0;
-#endif
+ if (error && error != -EOPNOTSUPP)
+ goto out_iput;
+ error = simple_acl_create(dir, inode);
+ if (error)
+ goto out_iput;
d_tmpfile(dentry, inode);
}
return error;
+out_iput:
+ iput(inode);
+ return error;
}
static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -2223,8 +2210,8 @@ static int shmem_initxattrs(struct inode *inode,
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
- &generic_acl_access_handler,
- &generic_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};
@@ -2722,13 +2709,13 @@ static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = shmem_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = shmem_file_read_iter,
+ .write_iter = generic_file_write_iter,
.fsync = noop_fsync,
.splice_read = shmem_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = shmem_fallocate,
#endif
};
@@ -2740,6 +2727,7 @@ static const struct inode_operations shmem_inode_operations = {
.getxattr = shmem_getxattr,
.listxattr = shmem_listxattr,
.removexattr = shmem_removexattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2764,6 +2752,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2776,6 +2765,7 @@ static const struct inode_operations shmem_special_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
+ .set_acl = simple_set_acl,
#endif
};
@@ -2794,6 +2784,7 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
+ .map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
@@ -2918,13 +2909,8 @@ static struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
-/**
- * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
- */
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+static struct file *__shmem_file_setup(const char *name, loff_t size,
+ unsigned long flags, unsigned int i_flags)
{
struct file *res;
struct inode *inode;
@@ -2957,6 +2943,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
if (!inode)
goto put_dentry;
+ inode->i_flags |= i_flags;
d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
@@ -2977,6 +2964,32 @@ put_memory:
shmem_unacct_size(flags, size);
return res;
}
+
+/**
+ * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
+ * kernel internal. There will be NO LSM permission checks against the
+ * underlying inode. So users of this interface must do LSM checks at a
+ * higher layer. The one user is the big_key implementation. LSM checks
+ * are provided at the key level rather than the inode level.
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, S_PRIVATE);
+}
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, 0);
+}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
diff --git a/mm/slab.c b/mm/slab.c
index eb043bf05f4..3070b929a1b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -157,6 +157,17 @@
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
+#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+ <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+
+#if FREELIST_BYTE_INDEX
+typedef unsigned char freelist_idx_t;
+#else
+typedef unsigned short freelist_idx_t;
+#endif
+
+#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
+
/*
* true if a page was allocated from pfmemalloc reserves for network-based
* swap
@@ -277,8 +288,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
* OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
*/
-#define REAPTIMEOUT_CPUC (2*HZ)
-#define REAPTIMEOUT_LIST3 (4*HZ)
+#define REAPTIMEOUT_AC (2*HZ)
+#define REAPTIMEOUT_NODE (4*HZ)
#if STATS
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
@@ -375,6 +386,39 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+ int freelist_size;
+ char *status;
+ struct kmem_cache *cachep = page->slab_cache;
+
+ freelist_size = cachep->num * sizeof(freelist_idx_t);
+ status = (char *)page->freelist + freelist_size;
+ status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+ int freelist_size;
+ char *status;
+ struct kmem_cache *cachep = page->slab_cache;
+
+ freelist_size = cachep->num * sizeof(freelist_idx_t);
+ status = (char *)page->freelist + freelist_size;
+
+ return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -565,9 +609,50 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return cachep->array[smp_processor_id()];
}
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static size_t calculate_freelist_size(int nr_objs, size_t align)
{
- return ALIGN(nr_objs * sizeof(unsigned int), align);
+ size_t freelist_size;
+
+ freelist_size = nr_objs * sizeof(freelist_idx_t);
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ freelist_size += nr_objs * sizeof(char);
+
+ if (align)
+ freelist_size = ALIGN(freelist_size, align);
+
+ return freelist_size;
+}
+
+static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
+ size_t idx_size, size_t align)
+{
+ int nr_objs;
+ size_t remained_size;
+ size_t freelist_size;
+ int extra_space = 0;
+
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ extra_space = sizeof(char);
+ /*
+ * Ignore padding for the initial guess. The padding
+ * is at most @align-1 bytes, and @buffer_size is at
+ * least @align. In the worst case, this result will
+ * be one greater than the number of objects that fit
+ * into the memory allocation when taking the padding
+ * into account.
+ */
+ nr_objs = slab_size / (buffer_size + idx_size + extra_space);
+
+ /*
+ * This calculated number will be either the right
+ * amount, or one greater than what we want.
+ */
+ remained_size = slab_size - nr_objs * buffer_size;
+ freelist_size = calculate_freelist_size(nr_objs, align);
+ if (remained_size < freelist_size)
+ nr_objs--;
+
+ return nr_objs;
}
/*
@@ -600,25 +685,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
nr_objs = slab_size / buffer_size;
} else {
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
- > slab_size)
- nr_objs--;
-
- mgmt_size = slab_mgmt_size(nr_objs, align);
+ nr_objs = calculate_nr_objs(slab_size, buffer_size,
+ sizeof(freelist_idx_t), align);
+ mgmt_size = calculate_freelist_size(nr_objs, align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -1067,7 +1136,7 @@ static int init_cache_node_node(int node)
list_for_each_entry(cachep, &slab_caches, list) {
/*
- * Set up the size64 kmemlist for cpu before we can
+ * Set up the kmem_cache_node for cpu before we can
* begin anything. Make sure some other cpu on this
* node has not already allocated this
*/
@@ -1076,12 +1145,12 @@ static int init_cache_node_node(int node)
if (!n)
return -ENOMEM;
kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
/*
- * The l3s don't come and go as CPUs come and
- * go. slab_mutex is sufficient
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
* protection here.
*/
cachep->node[node] = n;
@@ -1406,8 +1475,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
for_each_online_node(node) {
cachep->node[node] = &init_kmem_cache_node[index + node];
cachep->node[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
}
}
@@ -1604,10 +1673,16 @@ __initcall(cpucache_init);
static noinline void
slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
+#if DEBUG
struct kmem_cache_node *n;
struct page *page;
unsigned long flags;
int node;
+ static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
+ return;
printk(KERN_WARNING
"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
@@ -1645,6 +1720,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
+#endif
}
/*
@@ -1664,10 +1740,13 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
+ if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+ return NULL;
+
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
- if (!(flags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(cachep, flags, nodeid);
+ memcg_uncharge_slab(cachep, cachep->gfporder);
+ slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
@@ -1685,7 +1764,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
__SetPageSlab(page);
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page);
- memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1721,10 +1799,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
page_mapcount_reset(page);
page->mapping = NULL;
- memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_memcg_kmem_pages(page, cachep->gfporder);
+ __free_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(cachep, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1946,7 +2024,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
- * @slabp: slab pointer being destroyed
+ * @page: page pointer being destroyed
*
* Destroy all the objs in a slab, and release the mem back to the system.
* Before calling the slab must have been unlinked from the cache. The
@@ -2010,14 +2088,21 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (!num)
continue;
+ /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+ if (num > SLAB_OBJ_MAX_NUM)
+ break;
+
if (flags & CFLGS_OFF_SLAB) {
+ size_t freelist_size_per_obj = sizeof(freelist_idx_t);
/*
* Max number of objs-per-slab for caches which
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
+ if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+ freelist_size_per_obj += sizeof(char);
offslab_limit = size;
- offslab_limit /= sizeof(unsigned int);
+ offslab_limit /= freelist_size_per_obj;
if (num > offslab_limit)
break;
@@ -2103,8 +2188,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
}
cachep->node[numa_mem_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2243,7 +2328,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+ if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
@@ -2252,14 +2337,19 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
+ /*
+ * We should restrict the number of objects in a slab to implement
+ * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
+ */
+ if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+ size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
if (!cachep->num)
return -E2BIG;
- freelist_size =
- ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+ freelist_size = calculate_freelist_size(cachep->num, cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2272,7 +2362,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- freelist_size = cachep->num * sizeof(unsigned int);
+ freelist_size = calculate_freelist_size(cachep->num, 0);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2300,10 +2390,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & CFLGS_OFF_SLAB) {
cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
- * This is a possibility for one of the malloc_sizes caches.
+ * This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
- * this should not happen at all.
+ * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created
+ * in ascending order,this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
@@ -2442,8 +2532,7 @@ out:
return nr_freed;
}
-/* Called with slab_mutex held to protect against cpu hotplug */
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
struct kmem_cache_node *n;
@@ -2464,32 +2553,11 @@ static int __cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
-{
- int ret;
- BUG_ON(!cachep || in_interrupt());
-
- get_online_cpus();
- mutex_lock(&slab_mutex);
- ret = __cache_shrink(cachep);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_shrink);
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep);
if (rc)
return rc;
@@ -2511,14 +2579,17 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
/*
* Get the memory for a slab management obj.
- * For a slab cache when the slab descriptor is off-slab, slab descriptors
- * always come from malloc_sizes caches. The slab descriptor cannot
- * come from the same cache which is getting created because,
- * when we are searching for an appropriate cache for these
- * descriptors in kmem_cache_create, we search through the malloc_sizes array.
- * If we are creating a malloc_sizes cache here it would not be visible to
- * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have freelist_cache same as the original cache.
+ *
+ * For a slab cache when the slab descriptor is off-slab, the
+ * slab descriptor can't come from the same cache which is being created,
+ * Because if it is the case, that means we defer the creation of
+ * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+ * And we eventually call down to __kmem_cache_create(), which
+ * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one.
+ * This is a "chicken-and-egg" problem.
+ *
+ * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+ * which are all initialized during kmem_cache_init().
*/
static void *alloc_slabmgmt(struct kmem_cache *cachep,
struct page *page, int colour_off,
@@ -2542,9 +2613,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
return freelist;
}
-static inline unsigned int *slab_freelist(struct page *page)
+static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
+{
+ return ((freelist_idx_t *)page->freelist)[idx];
+}
+
+static inline void set_free_obj(struct page *page,
+ unsigned int idx, freelist_idx_t val)
{
- return (unsigned int *)(page->freelist);
+ ((freelist_idx_t *)(page->freelist))[idx] = val;
}
static void cache_init_objs(struct kmem_cache *cachep,
@@ -2589,7 +2666,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
if (cachep->ctor)
cachep->ctor(objp);
#endif
- slab_freelist(page)[i] = i;
+ set_obj_status(page, i, OBJECT_FREE);
+ set_free_obj(page, i, i);
}
}
@@ -2608,7 +2686,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
{
void *objp;
- objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
+ objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
#if DEBUG
WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
@@ -2629,7 +2707,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
- if (slab_freelist(page)[i] == objnr) {
+ if (get_free_obj(page, i) == objnr) {
printk(KERN_ERR "slab: double free detected in cache "
"'%s', objp %p\n", cachep->name, objp);
BUG();
@@ -2637,7 +2715,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
}
#endif
page->active--;
- slab_freelist(page)[page->active] = objnr;
+ set_free_obj(page, page->active, objnr);
}
/*
@@ -2797,6 +2875,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
+ set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2886,9 +2965,9 @@ retry:
/* move slabp to correct slabp list: */
list_del(&page->lru);
if (page->active == cachep->num)
- list_add(&page->list, &n->slabs_full);
+ list_add(&page->lru, &n->slabs_full);
else
- list_add(&page->list, &n->slabs_partial);
+ list_add(&page->lru, &n->slabs_partial);
}
must_grow:
@@ -2930,6 +3009,8 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
+ struct page *page;
+
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
@@ -2960,6 +3041,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
+
+ page = virt_to_head_page(objp);
+ set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
@@ -3027,7 +3111,7 @@ out:
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3042,7 +3126,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = slab_node();
+ nid_alloc = mempolicy_slab_node();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3073,8 +3157,8 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
retry_cpuset:
- cpuset_mems_cookie = get_mems_allowed();
- zonelist = node_zonelist(slab_node(), flags);
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
retry:
/*
@@ -3131,7 +3215,7 @@ retry:
}
}
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+ if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return obj;
}
@@ -3245,11 +3329,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
flags);
- if (likely(ptr))
+ if (likely(ptr)) {
kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
-
- if (unlikely((flags & __GFP_ZERO) && ptr))
- memset(ptr, 0, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(ptr, 0, cachep->object_size);
+ }
return ptr;
}
@@ -3259,7 +3343,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
- if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
@@ -3310,17 +3394,17 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
flags);
prefetchw(objp);
- if (likely(objp))
+ if (likely(objp)) {
kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
-
- if (unlikely((flags & __GFP_ZERO) && objp))
- memset(objp, 0, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(objp, 0, cachep->object_size);
+ }
return objp;
}
/*
- * Caller needs to acquire correct kmem_list's list_lock
+ * Caller needs to acquire correct kmem_cache_node's list_lock
*/
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
int node)
@@ -3574,11 +3658,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
struct kmem_cache *cachep;
void *ret;
- /* If you want to save a few bytes .text space: replace
- * __ with kmem_.
- * Then kmalloc uses the uninlined functions instead of the inline
- * functions.
- */
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
@@ -3670,7 +3749,7 @@ EXPORT_SYMBOL(kfree);
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
+static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_cache_node *n;
@@ -3726,8 +3805,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
}
kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
n->shared = new_shared;
n->alien = new_alien;
n->free_limit = (1 + nr_cpus_node(node)) *
@@ -3813,7 +3892,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep, gfp);
+ return alloc_kmem_cache_node(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3982,7 +4061,7 @@ static void cache_reap(struct work_struct *w)
if (time_after(n->next_reap, jiffies))
goto next;
- n->next_reap = jiffies + REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE;
drain_array(searchp, n, n->shared, 0, node);
@@ -4003,7 +4082,7 @@ next:
next_reap_node();
out:
/* Set up the next iteration */
- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
}
#ifdef CONFIG_SLABINFO
@@ -4201,21 +4280,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
struct page *page)
{
void *p;
- int i, j;
+ int i;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- bool active = true;
-
- for (j = page->active; j < c->num; j++) {
- /* Skip freed item */
- if (slab_freelist(page)[j] == i) {
- active = false;
- break;
- }
- }
- if (!active)
+ if (get_obj_status(page, i) != OBJECT_ACTIVE)
continue;
if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
diff --git a/mm/slab.h b/mm/slab.h
index 0859c4241ba..961a3fb1f5a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
struct mem_cgroup;
#ifdef CONFIG_SLUB
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *));
#else
static inline struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
@@ -91,6 +91,8 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *);
+void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
struct file;
@@ -119,28 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
return !s->memcg_params || s->memcg_params->is_root_cache;
}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
-{
- return (is_root_cache(cachep) && !memcg) ||
- (cachep->memcg_params->memcg == memcg);
-}
-
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
-{
- if (!is_root_cache(s))
- atomic_add(1 << order, &s->memcg_params->nr_pages);
-}
-
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
-{
- if (is_root_cache(s))
- return;
-
- if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
- mem_cgroup_destroy_cache(s);
-}
-
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
@@ -160,12 +140,36 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said the caller must assure the memcg's cache won't go away. Since once
+ * created a memcg's cache is destroyed only along with the root cache, it is
+ * true if we are going to allocate from the cache or hold a reference to the
+ * root cache by other means. Otherwise, we should hold either the slab_mutex
+ * or the memcg's slab_caches_mutex while calling this function and accessing
+ * the returned value.
+ */
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
if (!s->memcg_params)
return NULL;
- return s->memcg_params->memcg_caches[idx];
+
+ rcu_read_lock();
+ params = rcu_dereference(s->memcg_params);
+ cachep = params->memcg_caches[idx];
+ rcu_read_unlock();
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match this (see
+ * memcg_register_cache()).
+ */
+ smp_read_barrier_depends();
+ return cachep;
}
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
@@ -174,24 +178,29 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
return s;
return s->memcg_params->root_cache;
}
-#else
-static inline bool is_root_cache(struct kmem_cache *s)
-{
- return true;
-}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
+static __always_inline int memcg_charge_slab(struct kmem_cache *s,
+ gfp_t gfp, int order)
{
- return true;
+ if (!memcg_kmem_enabled())
+ return 0;
+ if (is_root_cache(s))
+ return 0;
+ return __memcg_charge_slab(s, gfp, order);
}
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
{
+ if (!memcg_kmem_enabled())
+ return;
+ if (is_root_cache(s))
+ return;
+ __memcg_uncharge_slab(s, order);
}
-
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
+#else
+static inline bool is_root_cache(struct kmem_cache *s)
{
+ return true;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -215,6 +224,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+ return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
#endif
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb399b0e..d31c4bacc6a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
- size_t size)
+static int kmem_cache_sanity_check(const char *name, size_t size)
{
struct kmem_cache *s = NULL;
@@ -56,14 +55,8 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}
-#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
- /*
- * For simplicity, we won't check this in the list of memcg
- * caches. We have control over memcg naming, and if there
- * aren't duplicates in the global list, there won't be any
- * duplicates in the memcg lists as well.
- */
- if (!memcg && !strcmp(s->name, name)) {
+#if !defined(CONFIG_SLUB)
+ if (!strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
return 0;
}
#else
-static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
- const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
{
return 0;
}
@@ -139,6 +131,45 @@ unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *),
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ int err;
+
+ err = -ENOMEM;
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (!s)
+ goto out;
+
+ s->name = name;
+ s->object_size = object_size;
+ s->size = size;
+ s->align = align;
+ s->ctor = ctor;
+
+ err = memcg_alloc_cache_params(memcg, s, root_cache);
+ if (err)
+ goto out_free_cache;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
+
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+out:
+ if (err)
+ return ERR_PTR(err);
+ return s;
+
+out_free_cache:
+ memcg_free_cache_params(s);
+ kfree(s);
+ goto out;
+}
/*
* kmem_cache_create - Create a cache.
@@ -164,20 +195,22 @@ unsigned long calculate_alignment(unsigned long flags,
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-
struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *),
- struct kmem_cache *parent_cache)
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s = NULL;
- int err = 0;
+ struct kmem_cache *s;
+ char *cache_name;
+ int err;
get_online_cpus();
+ get_online_mems();
+
mutex_lock(&slab_mutex);
- if (!kmem_cache_sanity_check(memcg, name, size) == 0)
- goto out_locked;
+ err = kmem_cache_sanity_check(name, size);
+ if (err)
+ goto out_unlock;
/*
* Some allocators will constraint the set of valid flags to a subset
@@ -187,47 +220,31 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
*/
flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
- goto out_locked;
-
- s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (s) {
- s->object_size = s->size = size;
- s->align = calculate_alignment(flags, align, size);
- s->ctor = ctor;
-
- if (memcg_register_cache(memcg, s, parent_cache)) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
-
- s->name = kstrdup(name, GFP_KERNEL);
- if (!s->name) {
- kmem_cache_free(kmem_cache, s);
- err = -ENOMEM;
- goto out_locked;
- }
+ goto out_unlock;
- err = __kmem_cache_create(s, flags);
- if (!err) {
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
- memcg_cache_list_add(memcg, s);
- } else {
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- }
- } else
+ cache_name = kstrdup(name, GFP_KERNEL);
+ if (!cache_name) {
err = -ENOMEM;
+ goto out_unlock;
+ }
-out_locked:
+ s = do_kmem_cache_create(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ kfree(cache_name);
+ }
+
+out_unlock:
mutex_unlock(&slab_mutex);
+
+ put_online_mems();
put_online_cpus();
if (err) {
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
@@ -236,54 +253,148 @@ out_locked:
name, err);
dump_stack();
}
-
return NULL;
}
+ return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * memcg_create_kmem_cache - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ * @memcg_name: The name of the memory cgroup (used for naming the new cache).
+ *
+ * This function attempts to create a kmem cache that will serve allocation
+ * requests going from @memcg to @root_cache. The new cache inherits properties
+ * from its parent.
+ */
+struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache,
+ const char *memcg_name)
+{
+ struct kmem_cache *s = NULL;
+ char *cache_name;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ memcg_cache_id(memcg), memcg_name);
+ if (!cache_name)
+ goto out_unlock;
+
+ s = do_kmem_cache_create(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
+ if (IS_ERR(s)) {
+ kfree(cache_name);
+ s = NULL;
+ }
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
return s;
}
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+static int memcg_cleanup_cache_params(struct kmem_cache *s)
+{
+ int rc;
+
+ if (!s->memcg_params ||
+ !s->memcg_params->is_root_cache)
+ return 0;
+
+ mutex_unlock(&slab_mutex);
+ rc = __memcg_cleanup_cache_params(s);
+ mutex_lock(&slab_mutex);
+
+ return rc;
+}
+#else
+static int memcg_cleanup_cache_params(struct kmem_cache *s)
{
- return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_create);
+#endif /* CONFIG_MEMCG_KMEM */
-void kmem_cache_destroy(struct kmem_cache *s)
+void slab_kmem_cache_release(struct kmem_cache *s)
{
- /* Destroy all the children caches if we aren't a memcg cache */
- kmem_cache_destroy_memcg_children(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+}
+void kmem_cache_destroy(struct kmem_cache *s)
+{
get_online_cpus();
+ get_online_mems();
+
mutex_lock(&slab_mutex);
+
s->refcount--;
- if (!s->refcount) {
- list_del(&s->list);
-
- if (!__kmem_cache_shutdown(s)) {
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_release_cache(s);
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- } else {
- list_add(&s->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
- dump_stack();
- }
- } else {
- mutex_unlock(&slab_mutex);
+ if (s->refcount)
+ goto out_unlock;
+
+ if (memcg_cleanup_cache_params(s) != 0)
+ goto out_unlock;
+
+ if (__kmem_cache_shutdown(s) != 0) {
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ goto out_unlock;
}
+
+ list_del(&s->list);
+
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_free_cache_params(s);
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_remove(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ goto out;
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+out:
+ put_online_mems();
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+
+ get_online_cpus();
+ get_online_mems();
+ ret = __kmem_cache_shrink(cachep);
+ put_online_mems();
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
int slab_is_available(void)
{
return slab_state >= UP;
@@ -498,6 +609,24 @@ void __init create_kmalloc_caches(unsigned long flags)
}
#endif /* !CONFIG_SLOB */
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ */
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret;
+ struct page *page;
+
+ flags |= __GFP_COMP;
+ page = alloc_kmem_pages(flags, order);
+ ret = page ? page_address(page) : NULL;
+ kmemleak_alloc(ret, size, 1, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
#ifdef CONFIG_TRACING
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
diff --git a/mm/slob.c b/mm/slob.c
index 4bf8809dfcc..21980e0f39a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->list, list);
+ list_add(&sp->lru, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->list);
+ list_del(&sp->lru);
__ClearPageSlobFree(sp);
}
@@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, list) {
+ list_for_each_entry(sp, slob_list, lru) {
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
continue;
/* Attempt to alloc */
- prev = sp->list.prev;
+ prev = sp->lru.prev;
b = slob_page_alloc(sp, size, align);
if (!b)
continue;
@@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->list);
+ INIT_LIST_HEAD(&sp->lru);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
b = slob_page_alloc(sp, size, align);
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);
struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 545a170ebf9..73004808537 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -210,21 +210,22 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void sysfs_slab_remove(struct kmem_cache *);
static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
-
static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- __this_cpu_inc(s->cpu_slab->stat[si]);
+ /*
+ * The rmw is racy on a preemptible kernel but this is acceptable, so
+ * avoid this_cpu_add()'s irq-disable overhead.
+ */
+ raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -355,6 +356,21 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
+static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
+{
+ struct page tmp;
+ tmp.counters = counters_new;
+ /*
+ * page->counters can cover frozen/inuse/objects as well
+ * as page->_count. If we assign to ->counters directly
+ * we run the risk of losing updates to page->_count, so
+ * be careful and only assign to the fields we need.
+ */
+ page->frozen = tmp.frozen;
+ page->inuse = tmp.inuse;
+ page->objects = tmp.objects;
+}
+
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -376,7 +392,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
return 1;
}
@@ -387,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
- printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
+ pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
@@ -415,7 +431,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- page->counters = counters_new;
+ set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
return 1;
@@ -428,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
- printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
+ pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
@@ -530,14 +546,14 @@ static void print_track(const char *s, struct track *t)
if (!t->addr)
return;
- printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
- s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
+ pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
+ s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
int i;
for (i = 0; i < TRACK_ADDRS_COUNT; i++)
if (t->addrs[i])
- printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
+ pr_err("\t%pS\n", (void *)t->addrs[i]);
else
break;
}
@@ -555,38 +571,37 @@ static void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- printk(KERN_ERR
- "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+ pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
page, page->objects, page->inuse, page->freelist, page->flags);
}
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char buf[100];
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
- printk(KERN_ERR "========================================"
- "=====================================\n");
- printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
- printk(KERN_ERR "----------------------------------------"
- "-------------------------------------\n\n");
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("=============================================================================\n");
+ pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+ pr_err("-----------------------------------------------------------------------------\n\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ va_end(args);
}
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char buf[100];
va_start(args, fmt);
- vsnprintf(buf, sizeof(buf), fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("FIX %s: %pV\n", s->name, &vaf);
va_end(args);
- printk(KERN_ERR "FIX %s: %s\n", s->name, buf);
}
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
@@ -598,8 +613,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_page_info(page);
- printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
- p, p - addr, get_freepointer(s, p));
+ pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
+ p, p - addr, get_freepointer(s, p));
if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
@@ -682,7 +697,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
end--;
slab_bug(s, "%s overwritten", what);
- printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
+ pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault[0], value);
print_trailer(s, page, object);
@@ -915,7 +930,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
int alloc)
{
if (s->flags & SLAB_TRACE) {
- printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
+ pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
s->name,
alloc ? "alloc" : "free",
object, page->inuse,
@@ -985,8 +1000,6 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
/*
* Tracking of fully allocated slabs for debugging purposes.
- *
- * list_lock must be held.
*/
static void add_full(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page)
@@ -994,17 +1007,16 @@ static void add_full(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
return;
+ lockdep_assert_held(&n->list_lock);
list_add(&page->lru, &n->full);
}
-/*
- * list_lock must be held.
- */
-static void remove_full(struct kmem_cache *s, struct page *page)
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
{
if (!(s->flags & SLAB_STORE_USER))
return;
+ lockdep_assert_held(&n->list_lock);
list_del(&page->lru);
}
@@ -1121,9 +1133,8 @@ static noinline struct kmem_cache_node *free_debug_processing(
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
} else if (!page->slab_cache) {
- printk(KERN_ERR
- "SLUB <none>: no slab for object 0x%p.\n",
- object);
+ pr_err("SLUB <none>: no slab for object 0x%p.\n",
+ object);
dump_stack();
} else
object_err(s, page, object,
@@ -1206,8 +1217,8 @@ static int __init setup_slub_debug(char *str)
slub_debug |= SLAB_FAILSLAB;
break;
default:
- printk(KERN_ERR "slub_debug option '%c' "
- "unknown. skipped\n", *str);
+ pr_err("slub_debug option '%c' unknown. skipped\n",
+ *str);
}
}
@@ -1250,7 +1261,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
-static inline void remove_full(struct kmem_cache *s, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+ struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
@@ -1300,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(gfp_t flags, int node,
- struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+ gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
+ struct page *page;
int order = oo_order(oo);
flags |= __GFP_NOTRACK;
+ if (memcg_charge_slab(s, flags, order))
+ return NULL;
+
if (node == NUMA_NO_NODE)
- return alloc_pages(flags, order);
+ page = alloc_pages(flags, order);
else
- return alloc_pages_exact_node(node, flags, order);
+ page = alloc_pages_exact_node(node, flags, order);
+
+ if (!page)
+ memcg_uncharge_slab(s, order);
+
+ return page;
}
static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1332,14 +1353,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- page = alloc_slab_page(alloc_gfp, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
oo = s->min;
+ alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(flags, node, oo);
+ page = alloc_slab_page(s, alloc_gfp, node, oo);
if (page)
stat(s, ORDER_FALLBACK);
@@ -1349,7 +1371,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
- kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+ kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
/*
* Objects from caches that have a constructor don't get
@@ -1400,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
order = compound_order(page);
inc_slabs_node(s, page_to_nid(page), page->objects);
- memcg_bind_pages(s, order);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
@@ -1451,11 +1472,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
- memcg_release_pages(s, order);
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_memcg_kmem_pages(page, order);
+ __free_pages(page, order);
+ memcg_uncharge_slab(s, order);
}
#define need_reserve_slab_rcu \
@@ -1504,11 +1525,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
/*
* Management of partially allocated slabs.
- *
- * list_lock must be held.
*/
-static inline void add_partial(struct kmem_cache_node *n,
- struct page *page, int tail)
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
@@ -1517,23 +1536,32 @@ static inline void add_partial(struct kmem_cache_node *n,
list_add(&page->lru, &n->partial);
}
-/*
- * list_lock must be held.
- */
-static inline void remove_partial(struct kmem_cache_node *n,
- struct page *page)
+static inline void add_partial(struct kmem_cache_node *n,
+ struct page *page, int tail)
+{
+ lockdep_assert_held(&n->list_lock);
+ __add_partial(n, page, tail);
+}
+
+static inline void
+__remove_partial(struct kmem_cache_node *n, struct page *page)
{
list_del(&page->lru);
n->nr_partial--;
}
+static inline void remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ lockdep_assert_held(&n->list_lock);
+ __remove_partial(n, page);
+}
+
/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
*
* Returns a list of objects or NULL if it fails.
- *
- * Must hold list_lock since we modify the partial list.
*/
static inline void *acquire_slab(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page,
@@ -1543,6 +1571,8 @@ static inline void *acquire_slab(struct kmem_cache *s,
unsigned long counters;
struct page new;
+ lockdep_assert_held(&n->list_lock);
+
/*
* Zap the freelist and set the frozen bit.
* The old freelist is the list of objects for the
@@ -1662,8 +1692,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
return NULL;
do {
- cpuset_mems_cookie = get_mems_allowed();
- zonelist = node_zonelist(slab_node(), flags);
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1674,19 +1704,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
object = get_partial_node(s, n, c, flags);
if (object) {
/*
- * Return the object even if
- * put_mems_allowed indicated that
- * the cpuset mems_allowed was
- * updated in parallel. It's a
- * harmless race between the alloc
- * and the cpuset update.
+ * Don't check read_mems_allowed_retry()
+ * here - if mems_allowed was updated in
+ * parallel, that was a harmless race
+ * between allocation and the cpuset
+ * update
*/
- put_mems_allowed(cpuset_mems_cookie);
return object;
}
}
}
- } while (!put_mems_allowed(cpuset_mems_cookie));
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif
return NULL;
}
@@ -1698,7 +1726,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
struct kmem_cache_cpu *c)
{
void *object;
- int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
+ int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -1748,19 +1776,19 @@ static inline void note_cmpxchg_failure(const char *n,
#ifdef SLUB_DEBUG_CMPXCHG
unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
- printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+ pr_info("%s %s: cmpxchg redo ", n, s->name);
#ifdef CONFIG_PREEMPT
if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
- printk("due to cpu change %d -> %d\n",
+ pr_warn("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
else
#endif
if (tid_to_event(tid) != tid_to_event(actual_tid))
- printk("due to cpu running other code. Event %ld->%ld\n",
+ pr_warn("due to cpu running other code. Event %ld->%ld\n",
tid_to_event(tid), tid_to_event(actual_tid));
else
- printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
actual_tid, tid, next_tid(tid));
#endif
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
@@ -1853,7 +1881,7 @@ redo:
new.frozen = 0;
- if (!new.inuse && n->nr_partial > s->min_partial)
+ if (!new.inuse && n->nr_partial >= s->min_partial)
m = M_FREE;
else if (new.freelist) {
m = M_PARTIAL;
@@ -1887,7 +1915,7 @@ redo:
else if (l == M_FULL)
- remove_full(s, page);
+ remove_full(s, n, page);
if (m == M_PARTIAL) {
@@ -1964,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s,
new.freelist, new.counters,
"unfreezing slab"));
- if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
page->next = discard_page;
discard_page = page;
} else {
@@ -2099,11 +2127,19 @@ static inline int node_match(struct page *page, int node)
return 1;
}
+#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct page *page)
{
return page->objects - page->inuse;
}
+static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
+{
+ return atomic_long_read(&n->total_objects);
+}
+#endif /* CONFIG_SLUB_DEBUG */
+
+#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
{
@@ -2117,31 +2153,28 @@ static unsigned long count_partial(struct kmem_cache_node *n,
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
-
-static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
-{
-#ifdef CONFIG_SLUB_DEBUG
- return atomic_long_read(&n->total_objects);
-#else
- return 0;
-#endif
-}
+#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
static noinline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
{
+#ifdef CONFIG_SLUB_DEBUG
+ static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
int node;
- printk(KERN_WARNING
- "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+ if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
+ return;
+
+ pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
nid, gfpflags);
- printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
- "default order: %d, min order: %d\n", s->name, s->object_size,
- s->size, oo_order(s->oo), oo_order(s->min));
+ pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
+ s->name, s->object_size, s->size, oo_order(s->oo),
+ oo_order(s->min));
if (oo_order(s->min) > get_order(s->object_size))
- printk(KERN_WARNING " %s debugging increased min order, use "
- "slub_debug=O to disable.\n", s->name);
+ pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+ s->name);
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -2156,10 +2189,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
nr_slabs = node_nr_slabs(n);
nr_objs = node_nr_objs(n);
- printk(KERN_WARNING
- " node %d: slabs: %ld, objs: %ld, free: %ld\n",
+ pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n",
node, nr_slabs, nr_objs, nr_free);
}
+#endif
}
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
@@ -2176,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
page = new_slab(s, flags, node);
if (page) {
- c = __this_cpu_ptr(s->cpu_slab);
+ c = raw_cpu_ptr(s->cpu_slab);
if (c->page)
flush_slab(s, c);
@@ -2301,8 +2334,6 @@ redo:
if (freelist)
goto load_freelist;
- stat(s, ALLOC_SLOWPATH);
-
freelist = get_freelist(s, page);
if (!freelist) {
@@ -2338,9 +2369,7 @@ new_slab:
freelist = new_slab_objects(s, gfpflags, node, &c);
if (unlikely(!freelist)) {
- if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(s, gfpflags, node);
-
+ slab_out_of_memory(s, gfpflags, node);
local_irq_restore(flags);
return NULL;
}
@@ -2396,7 +2425,7 @@ redo:
* and the retrieval of the tid.
*/
preempt_disable();
- c = __this_cpu_ptr(s->cpu_slab);
+ c = this_cpu_ptr(s->cpu_slab);
/*
* The transaction ids are globally unique per cpu and per operation on
@@ -2409,10 +2438,10 @@ redo:
object = c->freelist;
page = c->page;
- if (unlikely(!object || !node_match(page, node)))
+ if (unlikely(!object || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
-
- else {
+ stat(s, ALLOC_SLOWPATH);
+ } else {
void *next_object = get_freepointer_safe(s, object);
/*
@@ -2541,7 +2570,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen) {
- if (kmem_cache_has_cpu_partial(s) && !prior)
+ if (kmem_cache_has_cpu_partial(s) && !prior) {
/*
* Slab was on no list before and will be
@@ -2551,7 +2580,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
*/
new.frozen = 1;
- else { /* Needs to be taken off a list */
+ } else { /* Needs to be taken off a list */
n = get_node(s, page_to_nid(page));
/*
@@ -2591,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
}
- if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
goto slab_empty;
/*
@@ -2600,7 +2629,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
if (kmem_cache_debug(s))
- remove_full(s, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -2614,9 +2643,10 @@ slab_empty:
*/
remove_partial(n, page);
stat(s, FREE_REMOVE_PARTIAL);
- } else
+ } else {
/* Slab must be on the full list */
- remove_full(s, page);
+ remove_full(s, n, page);
+ }
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
@@ -2651,7 +2681,7 @@ redo:
* during the cmpxchg then the free will succedd.
*/
preempt_disable();
- c = __this_cpu_ptr(s->cpu_slab);
+ c = this_cpu_ptr(s->cpu_slab);
tid = c->tid;
preempt_enable();
@@ -2871,10 +2901,8 @@ static void early_kmem_cache_node_alloc(int node)
BUG_ON(!page);
if (page_to_nid(page) != node) {
- printk(KERN_ERR "SLUB: Unable to allocate memory from "
- "node %d\n", node);
- printk(KERN_ERR "SLUB: Allocating a useless per node structure "
- "in order to be able to continue\n");
+ pr_err("SLUB: Unable to allocate memory from node %d\n", node);
+ pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
}
n = page->freelist;
@@ -2890,7 +2918,11 @@ static void early_kmem_cache_node_alloc(int node)
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
- add_partial(n, page, DEACTIVATE_TO_HEAD);
+ /*
+ * No locks need to be taken here as it has just been
+ * initialized and there is no concurrent access.
+ */
+ __add_partial(n, page, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3155,8 +3187,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
- printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n",
- p, p - addr);
+ pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
@@ -3176,7 +3207,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- remove_partial(n, page);
+ __remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
@@ -3208,23 +3239,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int __kmem_cache_shutdown(struct kmem_cache *s)
{
- int rc = kmem_cache_close(s);
-
- if (!rc) {
- /*
- * We do the same lock strategy around sysfs_slab_add, see
- * __kmem_cache_create. Because this is pretty much the last
- * operation we do and the lock will be released shortly after
- * that in slab_common.c, we could just move sysfs_slab_remove
- * to a later point in common code. We should do that when we
- * have a common sysfs framework for all allocators.
- */
- mutex_unlock(&slab_mutex);
- sysfs_slab_remove(s);
- mutex_lock(&slab_mutex);
- }
-
- return rc;
+ return kmem_cache_close(s);
}
/********************************************************************
@@ -3294,8 +3309,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
- page = alloc_pages_node(node, flags, get_order(size));
+ flags |= __GFP_COMP | __GFP_NOTRACK;
+ page = alloc_kmem_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3364,7 +3379,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kfree_hook(x);
- __free_memcg_kmem_pages(page, compound_order(page));
+ __free_kmem_pages(page, compound_order(page));
return;
}
slab_free(page->slab_cache, page, object, _RET_IP_);
@@ -3381,7 +3396,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3437,7 +3452,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
kfree(slabs_by_inuse);
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);
static int slab_mem_going_offline_callback(void *arg)
{
@@ -3445,7 +3459,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- kmem_cache_shrink(s);
+ __kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);
return 0;
@@ -3639,9 +3653,7 @@ void __init kmem_cache_init(void)
register_cpu_notifier(&slab_notifier);
#endif
- printk(KERN_INFO
- "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d,"
- " CPUs=%d, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
@@ -3659,6 +3671,9 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
+ if (!is_root_cache(s))
+ return 1;
+
if (s->ctor)
return 1;
@@ -3671,9 +3686,8 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
- size_t align, unsigned long flags, const char *name,
- void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3696,7 +3710,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
continue;
if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- continue;
+ continue;
/*
* Check if alignment is compatible.
* Courtesy of Adrian Drzewiecki
@@ -3707,23 +3721,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
if (s->size - size >= sizeof(void *))
continue;
- if (!cache_match_memcg(s, memcg))
- continue;
-
return s;
}
return NULL;
}
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- s = find_mergeable(memcg, size, align, flags, name, ctor);
+ s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ int i;
+ struct kmem_cache *c;
+
s->refcount++;
+
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
@@ -3731,6 +3746,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
+ if (!c)
+ continue;
+ c->object_size = s->object_size;
+ c->inuse = max_t(int, c->inuse,
+ ALIGN(size, sizeof(void *)));
+ }
+
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -3753,10 +3777,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
return 0;
memcg_propagate_slab_attrs(s);
- mutex_unlock(&slab_mutex);
err = sysfs_slab_add(s);
- mutex_lock(&slab_mutex);
-
if (err)
kmem_cache_close(s);
@@ -3914,8 +3935,8 @@ static int validate_slab_node(struct kmem_cache *s,
count++;
}
if (count != n->nr_partial)
- printk(KERN_ERR "SLUB %s: %ld partial slabs counted but "
- "counter=%ld\n", s->name, count, n->nr_partial);
+ pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
+ s->name, count, n->nr_partial);
if (!(s->flags & SLAB_STORE_USER))
goto out;
@@ -3925,9 +3946,8 @@ static int validate_slab_node(struct kmem_cache *s,
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
- printk(KERN_ERR "SLUB: %s %ld slabs counted but "
- "counter=%ld\n", s->name, count,
- atomic_long_read(&n->nr_slabs));
+ pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
+ s->name, count, atomic_long_read(&n->nr_slabs));
out:
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4191,53 +4211,50 @@ static void resiliency_test(void)
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
- printk(KERN_ERR "SLUB resiliency testing\n");
- printk(KERN_ERR "-----------------------\n");
- printk(KERN_ERR "A. Corruption after allocation\n");
+ pr_err("SLUB resiliency testing\n");
+ pr_err("-----------------------\n");
+ pr_err("A. Corruption after allocation\n");
p = kzalloc(16, GFP_KERNEL);
p[16] = 0x12;
- printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
- " 0x12->0x%p\n\n", p + 16);
+ pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
+ p + 16);
validate_slab_cache(kmalloc_caches[4]);
/* Hmmm... The next two are dangerous */
p = kzalloc(32, GFP_KERNEL);
p[32 + sizeof(void *)] = 0x34;
- printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
- " 0x34 -> -0x%p\n", p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
+ pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches[5]);
p = kzalloc(64, GFP_KERNEL);
p += 64 + (get_cycles() & 0xff) * sizeof(void *);
*p = 0x56;
- printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
+ pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ pr_err("If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches[6]);
- printk(KERN_ERR "\nB. Corruption after free\n");
+ pr_err("\nB. Corruption after free\n");
p = kzalloc(128, GFP_KERNEL);
kfree(p);
*p = 0x78;
- printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
validate_slab_cache(kmalloc_caches[7]);
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
- printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
- p);
+ pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
validate_slab_cache(kmalloc_caches[8]);
p = kzalloc(512, GFP_KERNEL);
kfree(p);
p[512] = 0xab;
- printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
validate_slab_cache(kmalloc_caches[9]);
}
#else
@@ -4299,14 +4316,20 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
page = ACCESS_ONCE(c->partial);
if (page) {
- x = page->pobjects;
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ WARN_ON_ONCE(1);
+ else if (flags & SO_OBJECTS)
+ WARN_ON_ONCE(1);
+ else
+ x = page->pages;
total += x;
nodes[node] += x;
}
}
}
- lock_memory_hotplug();
+ get_online_mems();
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -4346,7 +4369,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- unlock_memory_hotplug();
+ put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -5025,15 +5048,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
#ifdef CONFIG_MEMCG_KMEM
int i;
char *buffer = NULL;
+ struct kmem_cache *root_cache;
- if (!is_root_cache(s))
+ if (is_root_cache(s))
return;
+ root_cache = s->memcg_params->root_cache;
+
/*
* This mean this cache had no attribute written. Therefore, no point
* in copying default values around
*/
- if (!s->max_attr_size)
+ if (!root_cache->max_attr_size)
return;
for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
@@ -5055,7 +5081,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5064,7 +5090,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
buf = buffer;
}
- attr->show(s->memcg_params->root_cache, buf);
+ attr->show(root_cache, buf);
attr->store(s, buf, strlen(buf));
}
@@ -5073,6 +5099,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
#endif
}
+static void kmem_cache_release(struct kobject *k)
+{
+ slab_kmem_cache_release(to_slab(k));
+}
+
static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -5080,6 +5111,7 @@ static const struct sysfs_ops slab_sysfs_ops = {
static struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
+ .release = kmem_cache_release,
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5097,6 +5129,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
static struct kset *slab_kset;
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->memcg_kset;
+#endif
+ return slab_kset;
+}
+
#define ID_STR_LENGTH 64
/* Create a unique string id for a slab cache:
@@ -5162,29 +5203,42 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = slab_kset;
- err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
- if (err) {
- kobject_put(&s->kobj);
- return err;
- }
+ s->kobj.kset = cache_kset(s);
+ err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
+ if (err)
+ goto out_put_kobj;
err = sysfs_create_group(&s->kobj, &slab_attr_group);
- if (err) {
- kobject_del(&s->kobj);
- kobject_put(&s->kobj);
- return err;
+ if (err)
+ goto out_del_kobj;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (is_root_cache(s)) {
+ s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+ if (!s->memcg_kset) {
+ err = -ENOMEM;
+ goto out_del_kobj;
+ }
}
+#endif
+
kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
- kfree(name);
}
- return 0;
+out:
+ if (!unmergeable)
+ kfree(name);
+ return err;
+out_del_kobj:
+ kobject_del(&s->kobj);
+out_put_kobj:
+ kobject_put(&s->kobj);
+ goto out;
}
-static void sysfs_slab_remove(struct kmem_cache *s)
+void sysfs_slab_remove(struct kmem_cache *s)
{
if (slab_state < FULL)
/*
@@ -5193,6 +5247,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
+#ifdef CONFIG_MEMCG_KMEM
+ kset_unregister(s->memcg_kset);
+#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
@@ -5243,7 +5300,7 @@ static int __init slab_sysfs_init(void)
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
- printk(KERN_ERR "Cannot register slab subsystem.\n");
+ pr_err("Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -5252,8 +5309,8 @@ static int __init slab_sysfs_init(void)
list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
if (err)
- printk(KERN_ERR "SLUB: Unable to add boot slab %s"
- " to sysfs\n", s->name);
+ pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
+ s->name);
}
while (alias_list) {
@@ -5262,8 +5319,8 @@ static int __init slab_sysfs_init(void)
alias_list = alias_list->next;
err = sysfs_slab_alias(al->s, al->name);
if (err)
- printk(KERN_ERR "SLUB: Unable to add boot slab alias"
- " %s to sysfs\n", al->name);
+ pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
+ al->name);
kfree(al);
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3be75..4cba9c2783a 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
+ return memblock_virt_alloc_try_nid(size, align, goal,
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}
static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
- free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0e959..d1b48b691ac 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,10 +5,12 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -69,7 +71,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
else
section = kzalloc(array_size, GFP_KERNEL);
} else {
- section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+ section = memblock_virt_alloc_node(array_size, nid);
}
return section;
@@ -268,7 +270,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
- * other sections referencing the usemap retmain active. Similarly,
+ * other sections referencing the usemap remain active. Similarly,
* a pgdat can prevent a section being removed. If section A
* contains a pgdat and section B contains the usemap, both
* sections become inter-dependent. This allocates usemaps
@@ -279,8 +281,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ p = memblock_virt_alloc_try_nid_nopanic(size,
+ SMP_CACHE_BYTES, goal, limit,
+ nid);
if (!p && limit) {
limit = 0;
goto again;
@@ -331,7 +334,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return alloc_bootmem_node_nopanic(pgdat, size);
+ return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +379,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
return map;
size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
- map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +405,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
@@ -458,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
}
#endif
-void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+void __weak __meminit vmemmap_populate_print_last(void)
{
}
@@ -545,7 +550,7 @@ void __init sparse_init(void)
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
*/
size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = alloc_bootmem(size);
+ usemap_map = memblock_virt_alloc(size, 0);
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +558,7 @@ void __init sparse_init(void)
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = alloc_bootmem(size2);
+ map_map = memblock_virt_alloc(size2, 0);
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +588,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- free_bootmem(__pa(map_map), size2);
+ memblock_free_early(__pa(map_map), size2);
#endif
- free_bootmem(__pa(usemap_map), size);
+ memblock_free_early(__pa(usemap_map), size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aaabd0..9e8e3472248 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,7 +31,6 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
-#include <linux/hugetlb.h>
#include "internal.h"
@@ -58,7 +57,7 @@ static void __page_cache_release(struct page *page)
spin_lock_irqsave(&zone->lru_lock, flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -68,7 +67,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, 0);
+ free_hot_cold_page(page, false);
}
static void __put_compound_page(struct page *page)
@@ -80,121 +79,185 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-static void put_compound_page(struct page *page)
+/**
+ * Two special cases here: we could avoid taking compound_lock_irqsave
+ * and could skip the tail refcounting(in _mapcount).
+ *
+ * 1. Hugetlbfs page:
+ *
+ * PageHeadHuge will remain true until the compound page
+ * is released and enters the buddy allocator, and it could
+ * not be split by __split_huge_page_refcount().
+ *
+ * So if we see PageHeadHuge set, and we have the tail page pin,
+ * then we could safely put head page.
+ *
+ * 2. Slab THP page:
+ *
+ * PG_slab is cleared before the slab frees the head page, and
+ * tail pin cannot be the last reference left on the head page,
+ * because the slab code is free to reuse the compound page
+ * after a kfree/kmem_cache_free without having to check if
+ * there's any tail pin left. In turn all tail pinsmust be always
+ * released while the head is still pinned by the slab code
+ * and so we know PG_slab will be still set too.
+ *
+ * So if we see PageSlab set, and we have the tail page pin,
+ * then we could safely put head page.
+ */
+static __always_inline
+void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
{
- if (unlikely(PageTail(page))) {
- /* __split_huge_page_refcount can run under us */
- struct page *page_head = compound_trans_head(page);
-
- if (likely(page != page_head &&
- get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * THP can not break up slab pages so avoid taking
- * compound_lock(). Slab performs non-atomic bit ops
- * on page->flags for better performance. In particular
- * slab_unlock() in slub used to be a hot path. It is
- * still hot on arches that do not support
- * this_cpu_cmpxchg_double().
- */
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- atomic_dec(&page->_mapcount);
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- if (put_page_testzero(page_head))
- __put_compound_page(page_head);
- return;
- } else
- /*
- * __split_huge_page_refcount
- * run before us, "page" was a
- * THP tail. The split
- * page_head has been freed
- * and reallocated as slab or
- * hugetlbfs page of smaller
- * order (only possible if
- * reallocated as slab on
- * x86).
- */
- goto skip_lock;
- }
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
-skip_lock:
- if (put_page_testzero(page_head)) {
- /*
- * The head page may have been
- * freed and reallocated as a
- * compound page of smaller
- * order and then freed again.
- * All we know is that it
- * cannot have become: a THP
- * page, a compound page of
- * higher order, a tail page.
- * That is because we still
- * hold the refcount of the
- * split THP tail and
- * page_head was the THP head
- * before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON(page_head != page->first_page);
+ /*
+ * If @page is a THP tail, we must read the tail page
+ * flags after the head page flags. The
+ * __split_huge_page_refcount side enforces write memory barriers
+ * between clearing PageTail and before the head page
+ * can be freed and reallocated.
+ */
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * __split_huge_page_refcount cannot race
+ * here, see the comment above this function.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
+ if (put_page_testzero(page_head)) {
/*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on
- * the compound_lock.
+ * If this is the tail of a slab THP page,
+ * the tail pin must not be the last reference
+ * held on the page, because the PG_slab cannot
+ * be cleared before all tail pins (which skips
+ * the _mapcount tail refcounting) have been
+ * released.
+ *
+ * If this is the tail of a hugetlbfs page,
+ * the tail pin may be the last reference on
+ * the page instead, because PageHeadHuge will
+ * not go away until the compound page enters
+ * the buddy allocator.
*/
- if (put_page_testzero(page_head))
- VM_BUG_ON(1);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON(page_mapcount(page) <= 0);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
- VM_BUG_ON(atomic_read(&page->_count) != 0);
- compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
+ __put_compound_page(page_head);
+ }
+ } else
+ /*
+ * __split_huge_page_refcount run before us,
+ * @page was a THP tail. The split @page_head
+ * has been freed and reallocated as slab or
+ * hugetlbfs page of smaller order (only
+ * possible if reallocated as slab on x86).
+ */
+ if (put_page_testzero(page))
+ __put_single_page(page);
+}
+
+static __always_inline
+void put_refcounted_compound_page(struct page *page_head, struct page *page)
+{
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+ /*
+ * @page_head wasn't a dangling pointer but it may not
+ * be a head page anymore by the time we obtain the
+ * lock. That is ok as long as it can't be freed from
+ * under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
if (put_page_testzero(page_head)) {
+ /*
+ * The @page_head may have been freed
+ * and reallocated as a compound page
+ * of smaller order and then freed
+ * again. All we know is that it
+ * cannot have become: a THP page, a
+ * compound page of higher order, a
+ * tail page. That is because we
+ * still hold the refcount of the
+ * split THP tail and page_head was
+ * the THP head before the split.
+ */
if (PageHead(page_head))
__put_compound_page(page_head);
else
__put_single_page(page_head);
}
- } else {
- /* page_head is a dangling pointer */
- VM_BUG_ON(PageTail(page));
- goto out_put_single;
+out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
}
- } else if (put_page_testzero(page)) {
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
+ VM_BUG_ON_PAGE(page_head != page->first_page, page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero() now that
+ * __split_huge_page_refcount() is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON_PAGE(1, page_head);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
+ atomic_dec(&page->_mapcount);
+ VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
+ VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
+ compound_unlock_irqrestore(page_head, flags);
+
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* @page_head is a dangling pointer */
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ goto out_put_single;
+ }
+}
+
+static void put_compound_page(struct page *page)
+{
+ struct page *page_head;
+
+ /*
+ * We see the PageCompound set and PageTail not set, so @page maybe:
+ * 1. hugetlbfs head page, or
+ * 2. THP head page.
+ */
+ if (likely(!PageTail(page))) {
+ if (put_page_testzero(page)) {
+ /*
+ * By the time all refcounts have been released
+ * split_huge_page cannot run anymore from under us.
+ */
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
+ }
+ return;
}
+
+ /*
+ * We see the PageCompound set and PageTail set, so @page maybe:
+ * 1. a tail hugetlbfs page, or
+ * 2. a tail THP page, or
+ * 3. a split THP page.
+ *
+ * Case 3 is possible, as we may race with
+ * __split_huge_page_refcount tearing down a THP page.
+ */
+ page_head = compound_head_by_tail(page);
+ if (!__compound_tail_refcounted(page_head))
+ put_unrefcounted_compound_page(page_head, page);
+ else
+ put_refcounted_compound_page(page_head, page);
}
void put_page(struct page *page)
@@ -221,36 +284,37 @@ bool __get_page_tail(struct page *page)
* split_huge_page().
*/
unsigned long flags;
- bool got = false;
- struct page *page_head = compound_trans_head(page);
+ bool got;
+ struct page *page_head = compound_head(page);
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /* Ref to put_compound_page() comment. */
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- __get_page_tail_foll(page, false);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- put_page(page_head);
- return false;
- }
+ /* Ref to put_compound_page() comment. */
+ if (!__compound_tail_refcounted(page_head)) {
+ smp_rmb();
+ if (likely(PageTail(page))) {
+ /*
+ * This is a hugetlbfs page or a slab
+ * page. __split_huge_page_refcount
+ * cannot race here.
+ */
+ VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
+ __get_page_tail_foll(page, true);
+ return true;
+ } else {
+ /*
+ * __split_huge_page_refcount run
+ * before us, "page" was a THP
+ * tail. The split page_head has been
+ * freed and reallocated as slab or
+ * hugetlbfs page of smaller order
+ * (only possible if reallocated as
+ * slab on x86).
+ */
+ return false;
}
+ }
+ got = false;
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
/*
* page_head wasn't a dangling pointer but it
* may not be a head page anymore by the time
@@ -409,7 +473,7 @@ void rotate_reclaimable_page(struct page *page)
page_cache_get(page);
local_irq_save(flags);
- pvec = &__get_cpu_var(lru_rotate_pvecs);
+ pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
pagevec_move_tail(pvec);
local_irq_restore(flags);
@@ -542,6 +606,8 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
+ if (page_is_file_cache(page))
+ workingset_activation(page);
} else if (!PageReferenced(page)) {
SetPageReferenced(page);
}
@@ -549,12 +615,17 @@ void mark_page_accessed(struct page *page)
EXPORT_SYMBOL(mark_page_accessed);
/*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
- * to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * Used to mark_page_accessed(page) that is not visible yet and when it is
+ * still safe to use non-atomic ops
*/
-void __lru_cache_add(struct page *page)
+void init_page_accessed(struct page *page)
+{
+ if (!PageReferenced(page))
+ __SetPageReferenced(page);
+}
+EXPORT_SYMBOL(init_page_accessed);
+
+static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
@@ -564,16 +635,39 @@ void __lru_cache_add(struct page *page)
pagevec_add(pvec, page);
put_cpu_var(lru_add_pvec);
}
-EXPORT_SYMBOL(__lru_cache_add);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+void lru_cache_add_anon(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+
+void lru_cache_add_file(struct page *page)
+{
+ if (PageActive(page))
+ ClearPageActive(page);
+ __lru_cache_add(page);
+}
+EXPORT_SYMBOL(lru_cache_add_file);
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
+ *
+ * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * to add the page to the [in]active [file|anon] list is deferred until the
+ * pagevec is drained. This gives a chance for the caller of lru_cache_add()
+ * have the page added to the active list using mark_page_accessed().
*/
void lru_cache_add(struct page *page)
{
- VM_BUG_ON(PageActive(page) && PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
__lru_cache_add(page);
}
@@ -779,7 +873,7 @@ void lru_add_drain_all(void)
* grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
* will free it.
*/
-void release_pages(struct page **pages, int nr, int cold)
+void release_pages(struct page **pages, int nr, bool cold)
{
int i;
LIST_HEAD(pages_to_free);
@@ -814,13 +908,13 @@ void release_pages(struct page **pages, int nr, int cold)
}
lruvec = mem_cgroup_page_lruvec(page, zone);
- VM_BUG_ON(!PageLRU(page));
+ VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
/* Clear Active bit in case of parallel mark_page_accessed */
- ClearPageActive(page);
+ __ClearPageActive(page);
list_add(&page->lru, &pages_to_free);
}
@@ -856,9 +950,9 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
{
const int file = 0;
- VM_BUG_ON(!PageHead(page));
- VM_BUG_ON(PageCompound(page_tail));
- VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page_tail), page);
+ VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
@@ -897,7 +991,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
int active = PageActive(page);
enum lru_list lru = page_lru(page);
- VM_BUG_ON(PageLRU(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, lru);
@@ -916,6 +1010,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_lru_add);
/**
+ * pagevec_lookup_entries - gang pagecache lookup
+ * @pvec: Where the resulting entries are placed
+ * @mapping: The address_space to search
+ * @start: The starting entry index
+ * @nr_entries: The maximum number of entries
+ * @indices: The cache indices corresponding to the entries in @pvec
+ *
+ * pagevec_lookup_entries() will search for and return a group of up
+ * to @nr_entries pages and shadow entries in the mapping. All
+ * entries are placed in @pvec. pagevec_lookup_entries() takes a
+ * reference against actual pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous entries with
+ * ascending indexes. There may be holes in the indices due to
+ * not-present entries.
+ *
+ * pagevec_lookup_entries() returns the number of entries which were
+ * found.
+ */
+unsigned pagevec_lookup_entries(struct pagevec *pvec,
+ struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages,
+ pgoff_t *indices)
+{
+ pvec->nr = find_get_entries(mapping, start, nr_pages,
+ pvec->pages, indices);
+ return pagevec_count(pvec);
+}
+
+/**
+ * pagevec_remove_exceptionals - pagevec exceptionals pruning
+ * @pvec: The pagevec to prune
+ *
+ * pagevec_lookup_entries() fills both pages and exceptional radix
+ * tree entries into the pagevec. This function prunes all
+ * exceptionals from @pvec without leaving holes, so that it can be
+ * passed on to page-only pagevec operations.
+ */
+void pagevec_remove_exceptionals(struct pagevec *pvec)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ if (!radix_tree_exceptional_entry(page))
+ pvec->pages[j++] = page;
+ }
+ pvec->nr = j;
+}
+
+/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2a..2972eee184a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
return ret;
}
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -83,9 +85,9 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
int error;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageSwapCache(page));
- VM_BUG_ON(!PageSwapBacked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
page_cache_get(page);
SetPageSwapCache(page);
@@ -139,9 +141,9 @@ void __delete_from_swap_cache(struct page *page)
swp_entry_t entry;
struct address_space *address_space;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageSwapCache(page));
- VM_BUG_ON(PageWriteback(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
entry.val = page_private(page);
address_space = swap_address_space(entry);
@@ -165,8 +167,8 @@ int add_to_swap(struct page *page, struct list_head *list)
swp_entry_t entry;
int err;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(!PageUptodate(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageUptodate(page), page);
entry = get_swap_page();
if (!entry.val)
@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
for (i = 0; i < todo; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, todo, 0);
+ release_pages(pagep, todo, false);
pagep += todo;
nr -= todo;
}
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), entry.val);
- if (page)
+ if (page) {
INC_CACHE_INFO(find_success);
+ if (TestClearPageReadahead(page))
+ atomic_inc(&swapin_readahead_hits);
+ }
INC_CACHE_INFO(find_total);
return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return found_page;
}
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+ static unsigned long prev_offset;
+ unsigned int pages, max_pages, last_ra;
+ static atomic_t last_readahead_pages;
+
+ max_pages = 1 << ACCESS_ONCE(page_cluster);
+ if (max_pages <= 1)
+ return 1;
+
+ /*
+ * This heuristic has been found to work well on both sequential and
+ * random loads, swapping to hard disk or to SSD: please don't ask
+ * what the "+ 2" means, it just happens to work well, that's all.
+ */
+ pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+ if (pages == 2) {
+ /*
+ * We can have no readahead hits to judge by: but must not get
+ * stuck here forever, so check for an adjacent offset instead
+ * (and don't even bother to check whether swap type is same).
+ */
+ if (offset != prev_offset + 1 && offset != prev_offset - 1)
+ pages = 1;
+ prev_offset = offset;
+ } else {
+ unsigned int roundup = 4;
+ while (roundup < pages)
+ roundup <<= 1;
+ pages = roundup;
+ }
+
+ if (pages > max_pages)
+ pages = max_pages;
+
+ /* Don't shrink readahead too fast */
+ last_ra = atomic_read(&last_readahead_pages) / 2;
+ if (pages < last_ra)
+ pages = last_ra;
+ atomic_set(&last_readahead_pages, pages);
+
+ return pages;
+}
+
/**
* swapin_readahead - swap in pages in hope we need them soon
* @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr)
{
struct page *page;
- unsigned long offset = swp_offset(entry);
+ unsigned long entry_offset = swp_offset(entry);
+ unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
- unsigned long mask = (1UL << page_cluster) - 1;
+ unsigned long mask;
struct blk_plug plug;
+ mask = swapin_nr_pages(offset) - 1;
+ if (!mask)
+ goto skip;
+
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
gfp_mask, vma, addr);
if (!page)
continue;
+ if (offset != entry_offset)
+ SetPageReadahead(page);
page_cache_release(page);
}
blk_finish_plug(&plug);
lru_add_drain(); /* Push any new pages onto the LRU now */
+skip:
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9795f..4c524f7bd0b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+PLIST_HEAD(swap_active_head);
+
+/*
+ * all available (active, not full) swap_info_structs
+ * protected with swap_avail_lock, ordered by priority.
+ * This is used by get_swap_page() instead of swap_active_head
+ * because swap_active_head includes all swap_info_structs,
+ * but get_swap_page() doesn't need to look at full ones.
+ * This uses its own lock instead of swap_lock because when a
+ * swap_info_struct changes between not-full/full, it needs to
+ * add/remove itself to/from this list, but the swap_info_struct->lock
+ * is held and the locking order requires swap_lock to be taken
+ * before any swap_info_struct->lock.
+ */
+static PLIST_HEAD(swap_avail_head);
+static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -505,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
/*
* If seek is expensive, start searching for new cluster from
* start of partition, to minimize the span of allocated swap.
- * But if seek is cheap, search from our current position, so
- * that swap is allocated from all over the partition: if the
- * Flash Translation Layer only remaps within limited zones,
- * we don't want to wear out the first zone too quickly.
+ * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
+ * case, just handled by scan_swap_map_try_ssd_cluster() above.
*/
- if (!(si->flags & SWP_SOLIDSTATE))
- scan_base = offset = si->lowest_bit;
+ scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
/* Locate the first empty (unaligned) cluster */
@@ -531,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
}
- offset = si->lowest_bit;
- last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
-
- /* Locate the first empty (unaligned) cluster */
- for (; last_in_cluster < scan_base; offset++) {
- if (si->swap_map[offset])
- last_in_cluster = offset + SWAPFILE_CLUSTER;
- else if (offset == last_in_cluster) {
- spin_lock(&si->lock);
- offset -= SWAPFILE_CLUSTER - 1;
- si->cluster_next = offset;
- si->cluster_nr = SWAPFILE_CLUSTER - 1;
- goto checks;
- }
- if (unlikely(--latency_ration < 0)) {
- cond_resched();
- latency_ration = LATENCY_LIMIT;
- }
- }
-
offset = scan_base;
spin_lock(&si->lock);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -591,6 +586,9 @@ checks:
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
+ spin_lock(&swap_avail_lock);
+ plist_del(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
si->swap_map[offset] = usage;
inc_cluster_info_page(si, si->cluster_info, offset);
@@ -616,7 +614,7 @@ scan:
}
}
offset = si->lowest_bit;
- while (++offset < scan_base) {
+ while (offset < scan_base) {
if (!si->swap_map[offset]) {
spin_lock(&si->lock);
goto checks;
@@ -629,6 +627,7 @@ scan:
cond_resched();
latency_ration = LATENCY_LIMIT;
}
+ offset++;
}
spin_lock(&si->lock);
@@ -639,71 +638,65 @@ no_page:
swp_entry_t get_swap_page(void)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si, *next;
pgoff_t offset;
- int type, next;
- int wrapped = 0;
- int hp_index;
- spin_lock(&swap_lock);
if (atomic_long_read(&nr_swap_pages) <= 0)
goto noswap;
atomic_long_dec(&nr_swap_pages);
- for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
- hp_index = atomic_xchg(&highest_priority_index, -1);
- /*
- * highest_priority_index records current highest priority swap
- * type which just frees swap entries. If its priority is
- * higher than that of swap_list.next swap type, we use it. It
- * isn't protected by swap_lock, so it can be an invalid value
- * if the corresponding swap type is swapoff. We double check
- * the flags here. It's even possible the swap type is swapoff
- * and swapon again and its priority is changed. In such rare
- * case, low prority swap type might be used, but eventually
- * high priority swap will be used after several rounds of
- * swap.
- */
- if (hp_index != -1 && hp_index != type &&
- swap_info[type]->prio < swap_info[hp_index]->prio &&
- (swap_info[hp_index]->flags & SWP_WRITEOK)) {
- type = hp_index;
- swap_list.next = type;
- }
-
- si = swap_info[type];
- next = si->next;
- if (next < 0 ||
- (!wrapped && si->prio != swap_info[next]->prio)) {
- next = swap_list.head;
- wrapped++;
- }
+ spin_lock(&swap_avail_lock);
+start_over:
+ plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ /* requeue si to after same-priority siblings */
+ plist_requeue(&si->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&si->lock);
- if (!si->highest_bit) {
- spin_unlock(&si->lock);
- continue;
- }
- if (!(si->flags & SWP_WRITEOK)) {
+ if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ if (plist_node_empty(&si->avail_list)) {
+ spin_unlock(&si->lock);
+ goto nextsi;
+ }
+ WARN(!si->highest_bit,
+ "swap_info %d in list but !highest_bit\n",
+ si->type);
+ WARN(!(si->flags & SWP_WRITEOK),
+ "swap_info %d in list but !SWP_WRITEOK\n",
+ si->type);
+ plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&si->lock);
- continue;
+ goto nextsi;
}
- swap_list.next = next;
-
- spin_unlock(&swap_lock);
/* This is called for allocating swap entry for cache */
offset = scan_swap_map(si, SWAP_HAS_CACHE);
spin_unlock(&si->lock);
if (offset)
- return swp_entry(type, offset);
- spin_lock(&swap_lock);
- next = swap_list.next;
+ return swp_entry(si->type, offset);
+ pr_debug("scan_swap_map of si %d failed to find offset\n",
+ si->type);
+ spin_lock(&swap_avail_lock);
+nextsi:
+ /*
+ * if we got here, it's likely that si was almost full before,
+ * and since scan_swap_map() can drop the si->lock, multiple
+ * callers probably all tried to get a page from the same si
+ * and it filled up before we could get one; or, the si filled
+ * up between us dropping swap_avail_lock and taking si->lock.
+ * Since we dropped the swap_avail_lock, the swap_avail_head
+ * list may have been modified; so if next is still in the
+ * swap_avail_head list then try it, otherwise start over.
+ */
+ if (plist_node_empty(&next->avail_list))
+ goto start_over;
}
+ spin_unlock(&swap_avail_lock);
+
atomic_long_inc(&nr_swap_pages);
noswap:
- spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
@@ -765,27 +758,6 @@ out:
return NULL;
}
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
- int old_hp_index, new_hp_index;
-
- do {
- old_hp_index = atomic_read(&highest_priority_index);
- if (old_hp_index != -1 &&
- swap_info[old_hp_index]->prio >= swap_info[type]->prio)
- break;
- new_hp_index = type;
- } while (atomic_cmpxchg(&highest_priority_index,
- old_hp_index, new_hp_index) != old_hp_index);
-}
-
static unsigned char swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -827,9 +799,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
- if (offset > p->highest_bit)
+ if (offset > p->highest_bit) {
+ bool was_full = !p->highest_bit;
p->highest_bit = offset;
- set_highest_priority_index(p->type);
+ if (was_full && (p->flags & SWP_WRITEOK)) {
+ spin_lock(&swap_avail_lock);
+ WARN_ON(!plist_node_empty(&p->avail_list));
+ if (plist_node_empty(&p->avail_list))
+ plist_add(&p->avail_list,
+ &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
+ }
+ }
atomic_long_inc(&nr_swap_pages);
p->inuse_pages--;
frontswap_invalidate_page(p->type, offset);
@@ -906,7 +887,7 @@ int reuse_swap_page(struct page *page)
{
int count;
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
count = page_mapcount(page);
@@ -926,7 +907,7 @@ int reuse_swap_page(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!PageSwapCache(page))
return 0;
@@ -1764,30 +1745,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
struct swap_cluster_info *cluster_info)
{
- int i, prev;
-
if (prio >= 0)
p->prio = prio;
else
p->prio = --least_priority;
+ /*
+ * the plist prio is negated because plist ordering is
+ * low-to-high, while swap ordering is high-to-low
+ */
+ p->list.prio = -p->prio;
+ p->avail_list.prio = -p->prio;
p->swap_map = swap_map;
p->cluster_info = cluster_info;
p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = p->type;
- else
- swap_info[prev]->next = p->type;
+ assert_spin_locked(&swap_lock);
+ /*
+ * both lists are plists, and thus priority ordered.
+ * swap_active_head needs to be priority ordered for swapoff(),
+ * which on removal of any swap_info_struct with an auto-assigned
+ * (i.e. negative) priority increments the auto-assigned priority
+ * of any lower-priority swap_info_structs.
+ * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * which allocates swap pages from the highest available priority
+ * swap_info_struct.
+ */
+ plist_add(&p->list, &swap_active_head);
+ spin_lock(&swap_avail_lock);
+ plist_add(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1822,8 +1810,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
- int i, type, prev;
- int err;
+ int err, found = 0;
unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
@@ -1841,17 +1828,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
goto out;
mapping = victim->f_mapping;
- prev = -1;
spin_lock(&swap_lock);
- for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
- p = swap_info[type];
+ plist_for_each_entry(p, &swap_active_head, list) {
if (p->flags & SWP_WRITEOK) {
- if (p->swap_file->f_mapping == mapping)
+ if (p->swap_file->f_mapping == mapping) {
+ found = 1;
break;
+ }
}
- prev = type;
}
- if (type < 0) {
+ if (!found) {
err = -EINVAL;
spin_unlock(&swap_lock);
goto out_dput;
@@ -1863,20 +1849,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
goto out_dput;
}
- if (prev < 0)
- swap_list.head = p->next;
- else
- swap_info[prev]->next = p->next;
- if (type == swap_list.next) {
- /* just pick something that's safe... */
- swap_list.next = swap_list.head;
- }
+ spin_lock(&swap_avail_lock);
+ plist_del(&p->avail_list, &swap_avail_head);
+ spin_unlock(&swap_avail_lock);
spin_lock(&p->lock);
if (p->prio < 0) {
- for (i = p->next; i >= 0; i = swap_info[i]->next)
- swap_info[i]->prio = p->prio--;
+ struct swap_info_struct *si = p;
+
+ plist_for_each_entry_continue(si, &swap_active_head, list) {
+ si->prio++;
+ si->list.prio--;
+ si->avail_list.prio--;
+ }
least_priority++;
}
+ plist_del(&p->list, &swap_active_head);
atomic_long_sub(p->pages, &nr_swap_pages);
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
@@ -1884,7 +1871,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
set_current_oom_origin();
- err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+ err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
clear_current_oom_origin();
if (err) {
@@ -1922,11 +1909,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_map = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
- p->flags = 0;
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
- frontswap_invalidate_area(type);
+ frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
@@ -1935,7 +1921,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
vfree(cluster_info);
vfree(frontswap_map);
/* Destroy swap account information */
- swap_cgroup_swapoff(type);
+ swap_cgroup_swapoff(p->type);
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
@@ -1948,6 +1934,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&inode->i_mutex);
}
filp_close(swap_file, NULL);
+
+ /*
+ * Clear the SWP_USED flag after all resources are freed so that swapon
+ * can reuse this swap_info in alloc_swap_info() safely. It is ok to
+ * not hold p->lock after we cleared its SWP_WRITEOK.
+ */
+ spin_lock(&swap_lock);
+ p->flags = 0;
+ spin_unlock(&swap_lock);
+
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
@@ -2132,8 +2128,9 @@ static struct swap_info_struct *alloc_swap_info(void)
*/
}
INIT_LIST_HEAD(&p->first_swap_extent.list);
+ plist_node_init(&p->list, 0);
+ plist_node_init(&p->avail_list, 0);
p->flags = SWP_USED;
- p->next = -1;
spin_unlock(&swap_lock);
spin_lock_init(&p->lock);
@@ -2714,7 +2711,7 @@ struct swap_info_struct *page_swap_info(struct page *page)
*/
struct address_space *__page_file_mapping(struct page *page)
{
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -2722,7 +2719,7 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
pgoff_t __page_file_index(struct page *page)
{
swp_entry_t swap = { .val = page_private(page) };
- VM_BUG_ON(!PageSwapCache(page));
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/truncate.c b/mm/truncate.c