diff options
68 files changed, 2454 insertions, 569 deletions
diff --git a/Documentation/devicetree/bindings/net/arc_emac.txt b/Documentation/devicetree/bindings/net/arc_emac.txt new file mode 100644 index 00000000000..bcbc3f00915 --- /dev/null +++ b/Documentation/devicetree/bindings/net/arc_emac.txt @@ -0,0 +1,38 @@ +* Synopsys ARC EMAC 10/100 Ethernet driver (EMAC) + +Required properties: +- compatible: Should be "snps,arc-emac" +- reg: Address and length of the register set for the device +- interrupts: Should contain the EMAC interrupts +- clock-frequency: CPU frequency. It is needed to calculate and set polling +period of EMAC. +- max-speed: Maximum supported data-rate in Mbit/s. In some HW configurations +bandwidth of external memory controller might be a limiting factor. That's why +it's required to specify which data-rate is supported on current SoC or FPGA. +For example if only 10 Mbit/s is supported (10BASE-T) set "10". If 100 Mbit/s is +supported (100BASE-TX) set "100". +- phy: PHY device attached to the EMAC via MDIO bus + +Child nodes of the driver are the individual PHY devices connected to the +MDIO bus. They must have a "reg" property given the PHY address on the MDIO bus. + +Optional properties: +- mac-address: 6 bytes, mac address + +Examples: + + ethernet@c0fc2000 { + compatible = "snps,arc-emac"; + reg = <0xc0fc2000 0x3c>; + interrupts = <6>; + mac-address = [ 00 11 22 33 44 55 ]; + clock-frequency = <80000000>; + max-speed = <100>; + phy = <&phy0>; + + #address-cells = <1>; + #size-cells = <0>; + phy0: ethernet-phy@0 { + reg = <1>; + }; + }; diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index e7454fcc917..87bbcfee2e0 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -303,6 +303,12 @@ arp_validate such a situation, validation of backup slaves must be disabled. + The validation of ARP requests on backup slaves is mainly + helping bonding to decide which slaves are more likely to + work in case of the active slave failure, it doesn't really + guarantee that the backup slave will work if it's selected + as the next active slave. + This option is useful in network configurations in which multiple bonding hosts are concurrently issuing ARPs to one or more targets beyond a common switch. Should the link between @@ -315,6 +321,25 @@ arp_validate This option was added in bonding version 3.1.0. +arp_all_targets + + Specifies the quantity of arp_ip_targets that must be reachable + in order for the ARP monitor to consider a slave as being up. + This option affects only active-backup mode for slaves with + arp_validation enabled. + + Possible values are: + + any or 0 + + consider the slave up only when any of the arp_ip_targets + is reachable + + all or 1 + + consider the slave up only when all of the arp_ip_targets + are reachable + downdelay Specifies the time, in milliseconds, to wait before disabling diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt index e6088baf109..5cc60058777 100644 --- a/Documentation/networking/netlink_mmap.txt +++ b/Documentation/networking/netlink_mmap.txt @@ -274,9 +274,9 @@ This example assumes some ring parameters of the ring setup are available. /* Get next frame header */ hdr = rx_ring + frame_offset; - if (hdr->nm_status == NL_MMAP_STATUS_VALID) + if (hdr->nm_status == NL_MMAP_STATUS_VALID) { /* Regular memory mapped frame */ - nlh = (void *hdr) + NL_MMAP_HDRLEN; + nlh = (void *)hdr + NL_MMAP_HDRLEN; len = hdr->nm_len; /* Release empty message immediately. May happen diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 5369879eafe..e658bbfb641 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -50,13 +50,27 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, it's a Per-CPU variable. Default: 64 -low_latency_poll +low_latency_read ---------------- -Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) +Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL) Approximate time in us to spin waiting for packets on the device queue. +This sets the default value of the SO_LL socket option. +Can be set or overridden per socket by setting socket option SO_LL. Recommended value is 50. May increase power usage. Default: 0 (off) +low_latency_poll +---------------- +Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL) +Approximate time in us to spin waiting for packets on the device queue. +Recommended value depends on the number of sockets you poll on. +For several sockets 50, for several hundreds 100. +For more than that you probably want to use epoll. +Note that only sockets with SO_LL set will be busy polled, so you want to either +selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally. +May increase power usage. +Default: 0 (off) + rmem_default ------------ diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 00aba08f01a..b45b240889f 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -240,6 +240,16 @@ config VIRTIO_NET This is the virtual network driver for virtio. It can be used with lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. +config NLMON + tristate "Virtual netlink monitoring device" + ---help--- + This option enables a monitoring net device for netlink skbs. The + purpose of this is to analyze netlink messages with packet sockets. + Thus applications like tcpdump will be able to see local netlink + messages if they tap into the netlink device, record pcaps for further + diagnostics, etc. This is mostly intended for developers or support + to debug netlink issues. If unsure, say N. + endif # NET_CORE config SUNGEM_PHY diff --git a/drivers/net/Makefile b/drivers/net/Makefile index ef3d090efed..3fef8a81c0f 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TUN) += tun.o obj-$(CONFIG_VETH) += veth.o obj-$(CONFIG_VIRTIO_NET) += virtio_net.o obj-$(CONFIG_VXLAN) += vxlan.o +obj-$(CONFIG_NLMON) += nlmon.o # # Networking Drivers diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3b31c19972d..142d55dc526 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -104,6 +104,7 @@ static char *xmit_hash_policy; static int arp_interval = BOND_LINK_ARP_INTERV; static char *arp_ip_target[BOND_MAX_ARP_TARGETS]; static char *arp_validate; +static char *arp_all_targets; static char *fail_over_mac; static int all_slaves_active = 0; static struct bond_params bonding_defaults; @@ -166,6 +167,8 @@ module_param(arp_validate, charp, 0); MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; " "0 for none (default), 1 for active, " "2 for backup, 3 for all"); +module_param(arp_all_targets, charp, 0); +MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all"); module_param(fail_over_mac, charp, 0); MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to " "the same MAC; 0 for none (default), " @@ -216,6 +219,12 @@ const struct bond_parm_tbl xmit_hashtype_tbl[] = { { NULL, -1}, }; +const struct bond_parm_tbl arp_all_targets_tbl[] = { +{ "any", BOND_ARP_TARGETS_ANY}, +{ "all", BOND_ARP_TARGETS_ALL}, +{ NULL, -1}, +}; + const struct bond_parm_tbl arp_validate_tbl[] = { { "none", BOND_ARP_VALIDATE_NONE}, { "active", BOND_ARP_VALIDATE_ACTIVE}, @@ -1483,7 +1492,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) struct slave *new_slave = NULL; struct sockaddr addr; int link_reporting; - int res = 0; + int res = 0, i; if (!bond->params.use_carrier && slave_dev->ethtool_ops->get_link == NULL && @@ -1712,6 +1721,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->last_arp_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); + for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) + new_slave->target_last_arp_rx[i] = new_slave->last_arp_rx; if (bond->params.miimon && !bond->params.use_carrier) { link_reporting = bond_check_dev_link(bond, slave_dev, 1); @@ -2611,18 +2622,19 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip) { int i; - __be32 *targets = bond->params.arp_targets; - for (i = 0; (i < BOND_MAX_ARP_TARGETS) && targets[i]; i++) { - pr_debug("bva: sip %pI4 tip %pI4 t[%d] %pI4 bhti(tip) %d\n", - &sip, &tip, i, &targets[i], - bond_has_this_ip(bond, tip)); - if (sip == targets[i]) { - if (bond_has_this_ip(bond, tip)) - slave->last_arp_rx = jiffies; - return; - } + if (!sip || !bond_has_this_ip(bond, tip)) { + pr_debug("bva: sip %pI4 tip %pI4 not found\n", &sip, &tip); + return; + } + + i = bond_get_targets_ip(bond->params.arp_targets, sip); + if (i == -1) { + pr_debug("bva: sip %pI4 not found in targets\n", &sip); + return; } + slave->last_arp_rx = jiffies; + slave->target_last_arp_rx[i] = jiffies; } static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, @@ -2637,6 +2649,10 @@ static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, return RX_HANDLER_ANOTHER; read_lock(&bond->lock); + + if (!slave_do_arp_validate(bond, slave)) + goto out_unlock; + alen = arp_hdr_len(bond->dev); pr_debug("bond_arp_rcv: bond %s skb->dev %s\n", @@ -2676,10 +2692,17 @@ static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, * configuration, the ARP probe will (hopefully) travel from * the active, through one switch, the router, then the other * switch before reaching the backup. + * + * We 'trust' the arp requests if there is an active slave and + * it received valid arp reply(s) after it became active. This + * is done to avoid endless looping when we can't reach the + * arp_ip_target and fool ourselves with our own arp requests. */ if (bond_is_active_slave(slave)) bond_validate_arp(bond, slave, sip, tip); - else + else if (bond->curr_active_slave && + time_after(slave_last_rx(bond, bond->curr_active_slave), + bond->curr_active_slave->jiffies)) bond_validate_arp(bond, slave, tip, sip); out_unlock: @@ -4401,6 +4424,7 @@ int bond_parse_parm(const char *buf, const struct bond_parm_tbl *tbl) static int bond_check_params(struct bond_params *params) { int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; + int arp_all_targets_value; /* * Convert string parameters. @@ -4591,7 +4615,11 @@ static int bond_check_params(struct bond_params *params) arp_ip_target[i]); arp_interval = 0; } else { - arp_target[arp_ip_count++] = ip; + if (bond_get_targets_ip(arp_target, ip) == -1) + arp_target[arp_ip_count++] = ip; + else + pr_warning("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", + &ip); } } @@ -4622,6 +4650,18 @@ static int bond_check_params(struct bond_params *params) } else arp_validate_value = 0; + arp_all_targets_value = 0; + if (arp_all_targets) { + arp_all_targets_value = bond_parse_parm(arp_all_targets, + arp_all_targets_tbl); + + if (arp_all_targets_value == -1) { + pr_err("Error: invalid arp_all_targets_value \"%s\"\n", + arp_all_targets); + arp_all_targets_value = 0; + } + } + if (miimon) { pr_info("MII link monitoring set to %d ms\n", miimon); } else if (arp_interval) { @@ -4686,6 +4726,7 @@ static int bond_check_params(struct bond_params *params) params->num_peer_notif = num_peer_notif; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; + params->arp_all_targets = arp_all_targets_value; params->updelay = updelay; params->downdelay = downdelay; params->use_carrier = use_carrier; @@ -4839,7 +4880,7 @@ static int __net_init bond_net_init(struct net *net) bond_create_proc_dir(bn); bond_create_sysfs(bn); - + return 0; } diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index f8bee4c0cbf..dc36a3d7d9e 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -443,6 +443,44 @@ static ssize_t bonding_store_arp_validate(struct device *d, static DEVICE_ATTR(arp_validate, S_IRUGO | S_IWUSR, bonding_show_arp_validate, bonding_store_arp_validate); +/* + * Show and set arp_all_targets. + */ +static ssize_t bonding_show_arp_all_targets(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct bonding *bond = to_bond(d); + int value = bond->params.arp_all_targets; + + return sprintf(buf, "%s %d\n", arp_all_targets_tbl[value].modename, + value); +} + +static ssize_t bonding_store_arp_all_targets(struct device *d, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct bonding *bond = to_bond(d); + int new_value; + + new_value = bond_parse_parm(buf, arp_all_targets_tbl); + if (new_value < 0) { + pr_err("%s: Ignoring invalid arp_all_targets value %s\n", + bond->dev->name, buf); + return -EINVAL; + } + pr_info("%s: setting arp_all_targets to %s (%d).\n", + bond->dev->name, arp_all_targets_tbl[new_value].modename, + new_value); + + bond->params.arp_all_targets = new_value; + + return count; +} + +static DEVICE_ATTR(arp_all_targets, S_IRUGO | S_IWUSR, + bonding_show_arp_all_targets, bonding_store_arp_all_targets); /* * Show and store fail_over_mac. User only allowed to change the @@ -590,10 +628,11 @@ static ssize_t bonding_store_arp_targets(struct device *d, struct device_attribute *attr, const char *buf, size_t count) { - __be32 newtarget; - int i = 0, done = 0, ret = count; struct bonding *bond = to_bond(d); - __be32 *targets; + struct slave *slave; + __be32 newtarget, *targets; + unsigned long *targets_rx; + int ind, i, j, ret = -EINVAL; targets = bond->params.arp_targets; newtarget = in_aton(buf + 1); @@ -602,57 +641,63 @@ static ssize_t bonding_store_arp_targets(struct device *d, if ((newtarget == 0) || (newtarget == htonl(INADDR_BROADCAST))) { pr_err("%s: invalid ARP target %pI4 specified for addition\n", bond->dev->name, &newtarget); - ret = -EINVAL; goto out; } - /* look for an empty slot to put the target in, and check for dupes */ - for (i = 0; (i < BOND_MAX_ARP_TARGETS) && !done; i++) { - if (targets[i] == newtarget) { /* duplicate */ - pr_err("%s: ARP target %pI4 is already present\n", - bond->dev->name, &newtarget); - ret = -EINVAL; - goto out; - } - if (targets[i] == 0) { - pr_info("%s: adding ARP target %pI4.\n", - bond->dev->name, &newtarget); - done = 1; - targets[i] = newtarget; - } + + if (bond_get_targets_ip(targets, newtarget) != -1) { /* dup */ + pr_err("%s: ARP target %pI4 is already present\n", + bond->dev->name, &newtarget); + goto out; } - if (!done) { + + ind = bond_get_targets_ip(targets, 0); /* first free slot */ + if (ind == -1) { pr_err("%s: ARP target table is full!\n", bond->dev->name); - ret = -EINVAL; goto out; } + pr_info("%s: adding ARP target %pI4.\n", bond->dev->name, + &newtarget); + /* not to race with bond_arp_rcv */ + write_lock_bh(&bond->lock); + bond_for_each_slave(bond, slave, i) + slave->target_last_arp_rx[ind] = jiffies; + targets[ind] = newtarget; + write_unlock_bh(&bond->lock); } else if (buf[0] == '-') { if ((newtarget == 0) || (newtarget == htonl(INADDR_BROADCAST))) { pr_err("%s: invalid ARP target %pI4 specified for removal\n", bond->dev->name, &newtarget); - ret = -EINVAL; goto out; } - for (i = 0; (i < BOND_MAX_ARP_TARGETS) && !done; i++) { - if (targets[i] == newtarget) { - int j; - pr_info("%s: removing ARP target %pI4.\n", - bond->dev->name, &newtarget); - for (j = i; (j < (BOND_MAX_ARP_TARGETS-1)) && targets[j+1]; j++) - targets[j] = targets[j+1]; - - targets[j] = 0; - done = 1; - } - } - if (!done) { - pr_info("%s: unable to remove nonexistent ARP target %pI4.\n", + ind = bond_get_targets_ip(targets, newtarget); + if (ind == -1) { + pr_err("%s: unable to remove nonexistent ARP target %pI4.\n", bond->dev->name, &newtarget); - ret = -EINVAL; goto out; } + + if (ind == 0 && !targets[1] && bond->params.arp_interval) + pr_warn("%s: removing last arp target with arp_interval on\n", + bond->dev->name); + + pr_info("%s: removing ARP target %pI4.\n", bond->dev->name, + &newtarget); + + write_lock_bh(&bond->lock); + bond_for_each_slave(bond, slave, i) { + targets_rx = slave->target_last_arp_rx; + j = ind; + for (; (j < BOND_MAX_ARP_TARGETS-1) && targets[j+1]; j++) + targets_rx[j] = targets_rx[j+1]; + targets_rx[j] = 0; + } + for (i = ind; (i < BOND_MAX_ARP_TARGETS-1) && targets[i+1]; i++) + targets[i] = targets[i+1]; + targets[i] = 0; + write_unlock_bh(&bond->lock); } else { pr_err("no command found in arp_ip_targets file for bond %s. Use +<addr> or -<addr>.\n", bond->dev->name); @@ -660,6 +705,7 @@ static ssize_t bonding_store_arp_targets(struct device *d, goto out; } + ret = count; out: return ret; } @@ -1635,6 +1681,7 @@ static struct attribute *per_bond_attrs[] = { &dev_attr_mode.attr, &dev_attr_fail_over_mac.attr, &dev_attr_arp_validate.attr, + &dev_attr_arp_all_targets.attr, &dev_attr_arp_interval.attr, &dev_attr_arp_ip_target.attr, &dev_attr_downdelay.attr, diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h index c990b42cf7a..3fb73cc8c34 100644 --- a/drivers/net/bonding/bonding.h +++ b/drivers/net/bonding/bonding.h @@ -144,6 +144,7 @@ struct bond_params { u8 num_peer_notif; int arp_interval; int arp_validate; + int arp_all_targets; int use_carrier; int fail_over_mac; int updelay; @@ -179,6 +180,7 @@ struct slave { int delay; unsigned long jiffies; unsigned long last_arp_rx; + unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; s8 link; /* one of BOND_LINK_XXXX */ s8 new_link; u8 backup:1, /* indicates backup slave. Value corresponds with @@ -322,6 +324,9 @@ static inline bool bond_is_active_slave(struct slave *slave) #define BOND_FOM_ACTIVE 1 #define BOND_FOM_FOLLOW 2 +#define BOND_ARP_TARGETS_ANY 0 +#define BOND_ARP_TARGETS_ALL 1 + #define BOND_ARP_VALIDATE_NONE 0 #define BOND_ARP_VALIDATE_ACTIVE (1 << BOND_STATE_ACTIVE) #define BOND_ARP_VALIDATE_BACKUP (1 << BOND_STATE_BACKUP) @@ -334,11 +339,31 @@ static inline int slave_do_arp_validate(struct bonding *bond, return bond->params.arp_validate & (1 << bond_slave_state(slave)); } +/* Get the oldest arp which we've received on this slave for bond's + * arp_targets. + */ +static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, + struct slave *slave) +{ + int i = 1; + unsigned long ret = slave->target_last_arp_rx[0]; + + for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) + if (time_before(slave->target_last_arp_rx[i], ret)) + ret = slave->target_last_arp_rx[i]; + + return ret; +} + static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave) { - if (slave_do_arp_validate(bond, slave)) - return slave->last_arp_rx; + if (slave_do_arp_validate(bond, slave)) { + if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) + return slave_oldest_target_arp_rx(bond, slave); + else + return slave->last_arp_rx; + } return slave->dev->last_rx; } @@ -464,12 +489,29 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, return NULL; } +/* Check if the ip is present in arp ip list, or first free slot if ip == 0 + * Returns -1 if not found, index if found + */ +static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) +{ + int i; + + for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) + if (targets[i] == ip) + return i; + else if (targets[i] == 0) + break; + + return -1; +} + /* exported from bond_main.c */ extern int bond_net_id; extern const struct bond_parm_tbl bond_lacp_tbl[]; extern const struct bond_parm_tbl bond_mode_tbl[]; extern const struct bond_parm_tbl xmit_hashtype_tbl[]; extern const struct bond_parm_tbl arp_validate_tbl[]; +extern const struct bond_parm_tbl arp_all_targets_tbl[]; extern const struct bond_parm_tbl fail_over_mac_tbl[]; extern const struct bond_parm_tbl pri_reselect_tbl[]; extern struct bond_parm_tbl ad_select_tbl[]; diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig index a989669b353..2037080c504 100644 --- a/drivers/net/ethernet/Kconfig +++ b/drivers/net/ethernet/Kconfig @@ -24,6 +24,7 @@ source "drivers/net/ethernet/allwinner/Kconfig" source "drivers/net/ethernet/alteon/Kconfig" source "drivers/net/ethernet/amd/Kconfig" source "drivers/net/ethernet/apple/Kconfig" +source "drivers/net/ethernet/arc/Kconfig" source "drivers/net/ethernet/atheros/Kconfig" source "drivers/net/ethernet/cadence/Kconfig" source "drivers/net/ethernet/adi/Kconfig" diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile index 009da27b6e2..390bd0bfaa2 100644 --- a/drivers/net/ethernet/Makefile +++ b/drivers/net/ethernet/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/ obj-$(CONFIG_NET_VENDOR_ALTEON) += alteon/ obj-$(CONFIG_NET_VENDOR_AMD) += amd/ obj-$(CONFIG_NET_VENDOR_APPLE) += apple/ +obj-$(CONFIG_NET_VENDOR_ARC) += arc/ obj-$(CONFIG_NET_VENDOR_ATHEROS) += atheros/ obj-$(CONFIG_NET_CADENCE) += cadence/ obj-$(CONFIG_NET_BFIN) += adi/ diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index 714dcfe3a46..a597b766f08 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -1016,7 +1016,6 @@ static void bmac_set_multicast(struct net_device *dev) static void bmac_set_multicast(struct net_device *dev) { struct netdev_hw_addr *ha; - int i; unsigned short rx_cfg; u32 crc; diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig new file mode 100644 index 00000000000..514c57fd26f --- /dev/null +++ b/drivers/net/ethernet/arc/Kconfig @@ -0,0 +1,31 @@ +# +# ARC EMAC network device configuration +# + +config NET_VENDOR_ARC + bool "ARC devices" + default y + ---help--- + If you have a network (Ethernet) card belonging to this class, say Y + and read the Ethernet-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. + + Note that the answer to this question doesn't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about ARC cards. If you say Y, you will be asked for + your specific card in the following questions. + +if NET_VENDOR_ARC + +config ARC_EMAC + tristate "ARC EMAC support" + select MII + select PHYLIB + depends on OF_IRQ + depends on OF_NET + ---help--- + On some legacy ARC (Synopsys) FPGA boards such as ARCAngel4/ML50x + non-standard on-chip ethernet device ARC EMAC 10/100 is used. + Say Y here if you have such a board. If unsure, say N. + +endif # NET_VENDOR_ARC diff --git a/drivers/net/ethernet/arc/Makefile b/drivers/net/ethernet/arc/Makefile new file mode 100644 index 00000000000..00c8657637d --- /dev/null +++ b/drivers/net/ethernet/arc/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the ARC network device drivers. +# + +arc_emac-objs := emac_main.o emac_mdio.o +obj-$(CONFIG_ARC_EMAC) += arc_emac.o diff --git a/drivers/net/ethernet/arc/emac.h b/drivers/net/ethernet/arc/emac.h new file mode 100644 index 00000000000..dc08678bf9a --- /dev/null +++ b/drivers/net/ethernet/arc/emac.h @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2004-2013 Synopsys, Inc. (www.synopsys.com) + * + * Registers and bits definitions of ARC EMAC + */ + +#ifndef ARC_EMAC_H +#define ARC_EMAC_H + +#include <linux/device.h> +#include <linux/dma-mapping.h> +#include <linux/netdevice.h> +#include <linux/phy.h> + +/* STATUS and ENABLE Register bit masks */ +#define TXINT_MASK (1<<0) /* Transmit interrupt */ +#define RXINT_MASK (1<<1) /* Receive interrupt */ +#define ERR_MASK (1<<2) /* Error interrupt */ +#define TXCH_MASK (1<<3) /* Transmit chaining error interrupt */ +#define MSER_MASK (1<<4) /* Missed packet counter error */ +#define RXCR_MASK (1<<8) /* RXCRCERR counter rolled over */ +#define RXFR_MASK (1<<9) /* RXFRAMEERR counter rolled over */ +#define RXFL_MASK (1<<10) /* RXOFLOWERR counter rolled over */ +#define MDIO_MASK (1<<12) /* MDIO complete interrupt */ +#define TXPL_MASK (1<<31) /* Force polling of BD by EMAC */ + +/* CONTROL Register bit masks */ +#define EN_MASK (1<<0) /* VMAC enable */ +#define TXRN_MASK (1<<3) /* TX enable */ +#define RXRN_MASK (1<<4) /* RX enable */ +#define DSBC_MASK (1<<8) /* Disable receive broadcast */ +#define ENFL_MASK (1<<10) /* Enable Full-duplex */ +#define PROM_MASK (1<<11) /* Promiscuous mode */ + +/* Buffer descriptor INFO bit masks */ +#define OWN_MASK (1<<31) /* 0-CPU owns buffer, 1-EMAC owns buffer */ +#define FIRST_MASK (1<<16) /* First buffer in chain */ +#define LAST_MASK (1<<17) /* Last buffer in chain */ +#define LEN_MASK 0x000007FF /* last 11 bits */ +#define CRLS (1<<21) +#define DEFR (1<<22) +#define DROP (1<<23) +#define RTRY (1<<24) +#define LTCL (1<<28) +#define UFLO (1<<29) + +#define FOR_EMAC OWN_MASK +#define FOR_CPU 0 + +/* ARC EMAC register set combines entries for MAC and MDIO */ +enum { + R_ID = 0, + R_STATUS, + R_ENABLE, + R_CTRL, + R_POLLRATE, + R_RXERR, + R_MISS, + R_TX_RING, + R_RX_RING, + R_ADDRL, + R_ADDRH, + R_LAFL, + R_LAFH, + R_MDIO, +}; + +#define TX_TIMEOUT (400*HZ/1000) /* Transmission timeout */ + +#define ARC_EMAC_NAPI_WEIGHT 40 /* Workload for NAPI */ + +#define EMAC_BUFFER_SIZE 1536 /* EMAC buffer size */ + +/** + * struct arc_emac_bd - EMAC buffer descriptor (BD). + * + * @info: Contains status information on the buffer itself. + * @data: 32-bit byte addressable pointer to the packet data. + */ +struct arc_emac_bd { + __le32 info; + dma_addr_t data; +}; + +/* Number of Rx/Tx BD's */ +#define RX_BD_NUM 128 +#define TX_BD_NUM 128 + +#define RX_RING_SZ (RX_BD_NUM * sizeof(struct arc_emac_bd)) +#define TX_RING_SZ (TX_BD_NUM * sizeof(struct arc_emac_bd)) + +/** + * struct buffer_state - Stores Rx/Tx buffer state. + * @sk_buff: Pointer to socket buffer. + * @addr: Start address of DMA-mapped memory region. + * @len: Length of DMA-mapped memory region. + */ +struct buffer_state { + struct sk_buff *skb; + DEFINE_DMA_UNMAP_ADDR(addr); + DEFINE_DMA_UNMAP_LEN(len); +}; + +/** + * struct arc_emac_priv - Storage of EMAC's private information. + * @dev: Pointer to the current device. + * @ndev: Pointer to the current network device. + * @phy_dev: Pointer to attached PHY device. + * @bus: Pointer to the current MII bus. + * @regs: Base address of EMAC memory-mapped control registers. + * @napi: Structure for NAPI. + * @stats: Network device statistics. + * @rxbd: Pointer to Rx BD ring. + * @txbd: Pointer to Tx BD ring. + * @rxbd_dma: DMA handle for Rx BD ring. + * @txbd_dma: DMA handle for Tx BD ring. + * @rx_buff: Storage for Rx buffers states. + * @tx_buff: Storage for Tx buffers states. + * @txbd_curr: Index of Tx BD to use on the next "ndo_start_xmit". + * @txbd_dirty: Index of Tx BD to free on the next Tx interrupt. + * @last_rx_bd: Index of the last Rx BD we've got from EMAC. + * @link: PHY's last seen link state. + * @duplex: PHY's last set duplex mode. + * @speed: PHY's last set speed. + * @max_speed: Maximum supported by current system network data-rate. + */ +struct arc_emac_priv { + /* Devices */ + struct device *dev; + struct net_device *ndev; + struct phy_device *phy_dev; + struct mii_bus *bus; + + void __iomem *regs; + + struct napi_struct napi; + struct net_device_stats stats; + + struct arc_emac_bd *rxbd; + struct arc_emac_bd *txbd; + + dma_addr_t rxbd_dma; + dma_addr_t txbd_dma; + + struct buffer_state rx_buff[RX_BD_NUM]; + struct buffer_state tx_buff[TX_BD_NUM]; + unsigned int txbd_curr; + unsigned int txbd_dirty; + + unsigned int last_rx_bd; + + unsigned int link; + unsigned int duplex; + unsigned int speed; + unsigned int max_speed; +}; + +/** + * arc_reg_set - Sets EMAC register with provided value. + * @priv: Pointer to ARC EMAC private data structure. + * @reg: Register offset from base address. + * @value: Value to set in register. + */ +static inline void arc_reg_set(struct arc_emac_priv *priv, int reg, int value) +{ + iowrite32(value, priv->regs + reg * sizeof(int)); +} + +/** + * arc_reg_get - Gets value of specified EMAC register. + * @priv: Pointer to ARC EMAC private data structure. + * @reg: Register offset from base address. + * + * returns: Value of requested register. + */ +static inline unsigned int arc_reg_get(struct arc_emac_priv *priv, int reg) +{ + return ioread32(priv->regs + reg * sizeof(int)); +} + +/** + * arc_reg_or - Applies mask to specified EMAC register - ("reg" | "mask"). + * @priv: Pointer to ARC EMAC private data structure. + * @reg: Register offset from base address. + * @mask: Mask to apply to specified register. + * + * This function reads initial register value, then applies provided mask + * to it and then writes register back. + */ +static inline void arc_reg_or(struct arc_emac_priv *priv, int reg, int mask) +{ + unsigned int value = arc_reg_get(priv, reg); + arc_reg_set(priv, reg, value | mask); +} + +/** + * arc_reg_clr - Applies mask to specified EMAC register - ("reg" & ~"mask"). + * @priv: Pointer to ARC EMAC private data structure. + * @reg: Register offset from base address. + * @mask: Mask to apply to specified register. + * + * This function reads initial register value, then applies provided mask + * to it and then writes register back. + */ +static inline void arc_reg_clr(struct arc_emac_priv *priv, int reg, int mask) +{ + unsigned int value = arc_reg_get(priv, reg); + arc_reg_set(priv, reg, value & ~mask); +} + +int arc_mdio_probe(struct platform_device *pdev, struct arc_emac_priv *priv); +int arc_mdio_remove(struct arc_emac_priv *priv); + +#endif /* ARC_EMAC_H */ diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c new file mode 100644 index 00000000000..20345f6bf89 --- /dev/null +++ b/drivers/net/ethernet/arc/emac_main.c @@ -0,0 +1,806 @@ +/* + * Copyright (C) 2004-2013 Synopsys, Inc. (www.synopsys.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Driver for the ARC EMAC 10100 (hardware revision 5) + * + * Contributors: + * Amit Bhor + * Sameer Dhavale + * Vineet Gupta + */ + +#include <linux/etherdevice.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/of_mdio.h> +#include <linux/of_net.h> +#include <linux/of_platform.h> + +#include "emac.h" + +#define DRV_NAME "arc_emac" +#define DRV_VERSION "1.0" + +/** + * arc_emac_adjust_link - Adjust the PHY link duplex. + * @ndev: Pointer to the net_device structure. + * + * This function is called to change the duplex setting after auto negotiation + * is done by the PHY. + */ +static void arc_emac_adjust_link(struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + struct phy_device *phy_dev = priv->phy_dev; + unsigned int reg, state_changed = 0; + + if (priv->link != phy_dev->link) { + priv->link = phy_dev->link; + state_changed = 1; + } + + if (priv->speed != phy_dev->speed) { + priv->speed = phy_dev->speed; + state_changed = 1; + } + + if (priv->duplex != phy_dev->duplex) { + reg = arc_reg_get(priv, R_CTRL); + + if (DUPLEX_FULL == phy_dev->duplex) + reg |= ENFL_MASK; + else + reg &= ~ENFL_MASK; + + arc_reg_set(priv, R_CTRL, reg); + priv->duplex = phy_dev->duplex; + state_changed = 1; + } + + if (state_changed) + phy_print_status(phy_dev); +} + +/** + * arc_emac_get_settings - Get PHY settings. + * @ndev: Pointer to net_device structure. + * @cmd: Pointer to ethtool_cmd structure. + * + * This implements ethtool command for getting PHY settings. If PHY could + * not be found, the function returns -ENODEV. This function calls the + * relevant PHY ethtool API to get the PHY settings. + * Issue "ethtool ethX" under linux prompt to execute this function. + */ +static int arc_emac_get_settings(struct net_device *ndev, + struct ethtool_cmd *cmd) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + + return phy_ethtool_gset(priv->phy_dev, cmd); +} + +/** + * arc_emac_set_settings - Set PHY settings as passed in the argument. + * @ndev: Pointer to net_device structure. + * @cmd: Pointer to ethtool_cmd structure. + * + * This implements ethtool command for setting various PHY settings. If PHY + * could not be found, the function returns -ENODEV. This function calls the + * relevant PHY ethtool API to set the PHY. + * Issue e.g. "ethtool -s ethX speed 1000" under linux prompt to execute this + * function. + */ +static int arc_emac_set_settings(struct net_device *ndev, + struct ethtool_cmd *cmd) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return phy_ethtool_sset(priv->phy_dev, cmd); +} + +/** + * arc_emac_get_drvinfo - Get EMAC driver information. + * @ndev: Pointer to net_device structure. + * @info: Pointer to ethtool_drvinfo structure. + * + * This implements ethtool command for getting the driver information. + * Issue "ethtool -i ethX" under linux prompt to execute this function. + */ +static void arc_emac_get_drvinfo(struct net_device *ndev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); + strlcpy(info->version, DRV_VERSION, sizeof(info->version)); +} + +static const struct ethtool_ops arc_emac_ethtool_ops = { + .get_settings = arc_emac_get_settings, + .set_settings = arc_emac_set_settings, + .get_drvinfo = arc_emac_get_drvinfo, + .get_link = ethtool_op_get_link, +}; + +#define FIRST_OR_LAST_MASK (FIRST_MASK | LAST_MASK) + +/** + * arc_emac_tx_clean - clears processed by EMAC Tx BDs. + * @ndev: Pointer to the network device. + */ +static void arc_emac_tx_clean(struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &priv->stats; + unsigned int i; + + for (i = 0; i < TX_BD_NUM; i++) { + unsigned int *txbd_dirty = &priv->txbd_dirty; + struct arc_emac_bd *txbd = &priv->txbd[*txbd_dirty]; + struct buffer_state *tx_buff = &priv->tx_buff[*txbd_dirty]; + struct sk_buff *skb = tx_buff->skb; + unsigned int info = le32_to_cpu(txbd->info); + + *txbd_dirty = (*txbd_dirty + 1) % TX_BD_NUM; + + if ((info & FOR_EMAC) || !txbd->data) + break; + + if (unlikely(info & (DROP | DEFR | LTCL | UFLO))) { + stats->tx_errors++; + stats->tx_dropped++; + + if (info & DEFR) + stats->tx_carrier_errors++; + + if (info & LTCL) + stats->collisions++; + + if (info & UFLO) + stats->tx_fifo_errors++; + } else if (likely(info & FIRST_OR_LAST_MASK)) { + stats->tx_packets++; + stats->tx_bytes += skb->len; + } + + dma_unmap_single(&ndev->dev, dma_unmap_addr(&tx_buff, addr), + dma_unmap_len(&tx_buff, len), DMA_TO_DEVICE); + + /* return the sk_buff to system */ + dev_kfree_skb_irq(skb); + + txbd->data = 0; + txbd->info = 0; + + if (netif_queue_stopped(ndev)) + netif_wake_queue(ndev); + } +} + +/** + * arc_emac_rx - processing of Rx packets. + * @ndev: Pointer to the network device. + * @budget: How many BDs to process on 1 call. + * + * returns: Number of processed BDs + * + * Iterate through Rx BDs and deliver received packages to upper layer. + */ +static int arc_emac_rx(struct net_device *ndev, int budget) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + unsigned int work_done; + + for (work_done = 0; work_done <= budget; work_done++) { + unsigned int *last_rx_bd = &priv->last_rx_bd; + struct net_device_stats *stats = &priv->stats; + struct buffer_state *rx_buff = &priv->rx_buff[*last_rx_bd]; + struct arc_emac_bd *rxbd = &priv->rxbd[*last_rx_bd]; + unsigned int buflen = EMAC_BUFFER_SIZE; + unsigned int pktlen, info = le32_to_cpu(rxbd->info); + struct sk_buff *skb; + dma_addr_t addr; + + if (unlikely((info & OWN_MASK) == FOR_EMAC)) + break; + + /* Make a note that we saw a packet at this BD. + * So next time, driver starts from this + 1 + */ + *last_rx_bd = (*last_rx_bd + 1) % RX_BD_NUM; + + if (unlikely((info & FIRST_OR_LAST_MASK) != + FIRST_OR_LAST_MASK)) { + /* We pre-allocate buffers of MTU size so incoming + * packets won't be split/chained. + */ + if (net_ratelimit()) + netdev_err(ndev, "incomplete packet received\n"); + + /* Return ownership to EMAC */ + rxbd->info = cpu_to_le32(FOR_EMAC | buflen); + stats->rx_errors++; + stats->rx_length_errors++; + continue; + } + + pktlen = info & LEN_MASK; + stats->rx_packets++; + stats->rx_bytes += pktlen; + skb = rx_buff->skb; + skb_put(skb, pktlen); + skb->dev = ndev; + skb->protocol = eth_type_trans(skb, ndev); + + dma_unmap_single(&ndev->dev, dma_unmap_addr(&rx_buff, addr), + dma_unmap_len(&rx_buff, len), DMA_FROM_DEVICE); + + /* Prepare the BD for next cycle */ + rx_buff->skb = netdev_alloc_skb_ip_align(ndev, buflen); + if (unlikely(!rx_buff->skb)) { + stats->rx_errors++; + /* Because receive_skb is below, increment rx_dropped */ + stats->rx_dropped++; + continue; + } + + /* receive_skb only if new skb was allocated to avoid holes */ + netif_receive_skb(skb); + + addr = dma_map_single(&ndev->dev, (void *)rx_buff->skb->data, + buflen, DMA_FROM_DEVICE); + if (dma_mapping_error(&ndev->dev, addr)) { + if (net_ratelimit()) + netdev_err(ndev, "cannot dma map\n"); + dev_kfree_skb(rx_buff->skb); + stats->rx_errors++; + continue; + } + dma_unmap_addr_set(&rx_buff, mapping, addr); + dma_unmap_len_set(&rx_buff, len, buflen); + + rxbd->data = cpu_to_le32(rx_buff->skb->data); + + /* Make sure pointer to data buffer is set */ + wmb(); + + /* Return ownership to EMAC */ + rxbd->info = cpu_to_le32(FOR_EMAC | buflen); + } + + return work_done; +} + +/** + * arc_emac_poll - NAPI poll handler. + * @napi: Pointer to napi_struct structure. + * @budget: How many BDs to process on 1 call. + * + * returns: Number of processed BDs + */ +static int arc_emac_poll(struct napi_struct *napi, int budget) +{ + struct net_device *ndev = napi->dev; + struct arc_emac_priv *priv = netdev_priv(ndev); + unsigned int work_done; + + arc_emac_tx_clean(ndev); + + work_done = arc_emac_rx(ndev, budget); + if (work_done < budget) { + napi_complete(napi); + arc_reg_or(priv, R_ENABLE, RXINT_MASK); + } + + return work_done; +} + +/** + * arc_emac_intr - Global interrupt handler for EMAC. + * @irq: irq number. + * @dev_instance: device instance. + * + * returns: IRQ_HANDLED for all cases. + * + * ARC EMAC has only 1 interrupt line, and depending on bits raised in + * STATUS register we may tell what is a reason for interrupt to fire. + */ +static irqreturn_t arc_emac_intr(int irq, void *dev_instance) +{ + struct net_device *ndev = dev_instance; + struct arc_emac_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &priv->stats; + unsigned int status; + + status = arc_reg_get(priv, R_STATUS); + status &= ~MDIO_MASK; + + /* Reset all flags except "MDIO complete" */ + arc_reg_set(priv, R_STATUS, status); + + if (status & RXINT_MASK) { + if (likely(napi_schedule_prep(&priv->napi))) { + arc_reg_clr(priv, R_ENABLE, RXINT_MASK); + __napi_schedule(&priv->napi); + } + } + + if (status & ERR_MASK) { + /* MSER/RXCR/RXFR/RXFL interrupt fires on corresponding + * 8-bit error counter overrun. + */ + + if (status & MSER_MASK) { + stats->rx_missed_errors += 0x100; + stats->rx_errors += 0x100; + } + + if (status & RXCR_MASK) { + stats->rx_crc_errors += 0x100; + stats->rx_errors += 0x100; + } + + if (status & RXFR_MASK) { + stats->rx_frame_errors += 0x100; + stats->rx_errors += 0x100; + } + + if (status & RXFL_MASK) { + stats->rx_over_errors += 0x100; + stats->rx_errors += 0x100; + } + } + + return IRQ_HANDLED; +} + +/** + * arc_emac_open - Open the network device. + * @ndev: Pointer to the network device. + * + * returns: 0, on success or non-zero error value on failure. + * + * This function sets the MAC address, requests and enables an IRQ + * for the EMAC device and starts the Tx queue. + * It also connects to the phy device. + */ +static int arc_emac_open(struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + struct phy_device *phy_dev = priv->phy_dev; + struct arc_emac_bd *bd; + struct sk_buff *skb; + int i; + + phy_dev->autoneg = AUTONEG_ENABLE; + phy_dev->speed = 0; + phy_dev->duplex = 0; + phy_dev->advertising = phy_dev->supported; + + if (priv->max_speed > 100) { + phy_dev->advertising &= PHY_GBIT_FEATURES; + } else if (priv->max_speed <= 100) { + phy_dev->advertising &= PHY_BASIC_FEATURES; + if (priv->max_speed <= 10) { + phy_dev->advertising &= ~SUPPORTED_100baseT_Half; + phy_dev->advertising &= ~SUPPORTED_100baseT_Full; + } + } + + /* Allocate and set buffers for Rx BD's */ + bd = priv->rxbd; + for (i = 0; i < RX_BD_NUM; i++) { + skb = netdev_alloc_skb_ip_align(ndev, EMAC_BUFFER_SIZE); + if (unlikely(!skb)) + return -ENOMEM; + + priv->rx_buff[i].skb = skb; + bd->data = cpu_to_le32(skb->data); + + /* Make sure pointer to data buffer is set */ + wmb(); + + /* Set ownership to EMAC */ + bd->info = cpu_to_le32(FOR_EMAC | EMAC_BUFFER_SIZE); + bd++; + } + + priv->last_rx_bd = 0; + + /* Clean Tx BD's */ + memset(priv->txbd, 0, TX_RING_SZ); + + /* Initialize logical address filter */ + arc_reg_set(priv, R_LAFL, 0); + arc_reg_set(priv, R_LAFH, 0); + + /* Set BD ring pointers for device side */ + arc_reg_set(priv, R_RX_RING, (unsigned int)priv->rxbd_dma); + arc_reg_set(priv, R_TX_RING, (unsigned int)priv->txbd_dma); + + /* Enable interrupts */ + arc_reg_set(priv, R_ENABLE, RXINT_MASK | ERR_MASK); + + /* Set CONTROL */ + arc_reg_set(priv, R_CTRL, + (RX_BD_NUM << 24) | /* RX BD table length */ + (TX_BD_NUM << 16) | /* TX BD table length */ + TXRN_MASK | RXRN_MASK); + + napi_enable(&priv->napi); + + /* Enable EMAC */ + arc_reg_or(priv, R_CTRL, EN_MASK); + + phy_start_aneg(priv->phy_dev); + + netif_start_queue(ndev); + + return 0; +} + +/** + * arc_emac_stop - Close the network device. + * @ndev: Pointer to the network device. + * + * This function stops the Tx queue, disables interrupts and frees the IRQ for + * the EMAC device. + * It also disconnects the PHY device associated with the EMAC device. + */ +static int arc_emac_stop(struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + + napi_disable(&priv->napi); + netif_stop_queue(ndev); + + /* Disable interrupts */ + arc_reg_clr(priv, R_ENABLE, RXINT_MASK | ERR_MASK); + + /* Disable EMAC */ + arc_reg_clr(priv, R_CTRL, EN_MASK); + + return 0; +} + +/** + * arc_emac_stats - Get system network statistics. + * @ndev: Pointer to net_device structure. + * + * Returns the address of the device statistics structure. + * Statistics are updated in interrupt handler. + */ +static struct net_device_stats *arc_emac_stats(struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &priv->stats; + unsigned long miss, rxerr; + u8 rxcrc, rxfram, rxoflow; + + rxerr = arc_reg_get(priv, R_RXERR); + miss = arc_reg_get(priv, R_MISS); + + rxcrc = rxerr; + rxfram = rxerr >> 8; + rxoflow = rxerr >> 16; + + stats->rx_errors += miss; + stats->rx_errors += rxcrc + rxfram + rxoflow; + + stats->rx_over_errors += rxoflow; + stats->rx_frame_errors += rxfram; + stats->rx_crc_errors += rxcrc; + stats->rx_missed_errors += miss; + + return stats; +} + +/** + * arc_emac_tx - Starts the data transmission. + * @skb: sk_buff pointer that contains data to be Transmitted. + * @ndev: Pointer to net_device structure. + * + * returns: NETDEV_TX_OK, on success + * NETDEV_TX_BUSY, if any of the descriptors are not free. + * + * This function is invoked from upper layers to initiate transmission. + */ +static int arc_emac_tx(struct sk_buff *skb, struct net_device *ndev) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + unsigned int len, *txbd_curr = &priv->txbd_curr; + struct net_device_stats *stats = &priv->stats; + __le32 *info = &priv->txbd[*txbd_curr].info; + dma_addr_t addr; + + if (skb_padto(skb, ETH_ZLEN)) + return NETDEV_TX_OK; + + len = max_t(unsigned int, ETH_ZLEN, skb->len); + + /* EMAC still holds this buffer in its possession. + * CPU must not modify this buffer descriptor + */ + if (unlikely((le32_to_cpu(*info) & OWN_MASK) == FOR_EMAC)) { + netif_stop_queue(ndev); + return NETDEV_TX_BUSY; + } + + addr = dma_map_single(&ndev->dev, (void *)skb->data, len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&ndev->dev, addr))) { + stats->tx_dropped++; + stats->tx_errors++; + dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + dma_unmap_addr_set(&priv->tx_buff[*txbd_curr], mapping, addr); + dma_unmap_len_set(&priv->tx_buff[*txbd_curr], len, len); + + priv->tx_buff[*txbd_curr].skb = skb; + priv->txbd[*txbd_curr].data = cpu_to_le32(skb->data); + + /* Make sure pointer to data buffer is set */ + wmb(); + + *info = cpu_to_le32(FOR_EMAC | FIRST_OR_LAST_MASK | len); + + /* Increment index to point to the next BD */ + *txbd_curr = (*txbd_curr + 1) % TX_BD_NUM; + + /* Get "info" of the next BD */ + info = &priv->txbd[*txbd_curr].info; + + /* Check if if Tx BD ring is full - next BD is still owned by EMAC */ + if (unlikely((le32_to_cpu(*info) & OWN_MASK) == FOR_EMAC)) + netif_stop_queue(ndev); + + arc_reg_set(priv, R_STATUS, TXPL_MASK); + + skb_tx_timestamp(skb); + + return NETDEV_TX_OK; +} + +/** + * arc_emac_set_address - Set the MAC address for this device. + * @ndev: Pointer to net_device structure. + * @p: 6 byte Address to be written as MAC address. + * + * This function copies the HW address from the sockaddr structure to the + * net_device structure and updates the address in HW. + * + * returns: -EBUSY if the net device is busy or 0 if the address is set + * successfully. + */ +static int arc_emac_set_address(struct net_device *ndev, void *p) +{ + struct arc_emac_priv *priv = netdev_priv(ndev); + struct sockaddr *addr = p; + unsigned int addr_low, addr_hi; + + if (netif_running(ndev)) + return -EBUSY; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + + memcpy(ndev->dev_addr, addr->sa_data, ndev->addr_len); + + addr_low = le32_to_cpu(*(__le32 *) &ndev->dev_addr[0]); + addr_hi = le16_to_cpu(*(__le16 *) &ndev->dev_addr[4]); + + arc_reg_set(priv, R_ADDRL, addr_low); + arc_reg_set(priv, R_ADDRH, addr_hi); + + return 0; +} + +static const struct net_device_ops arc_emac_netdev_ops = { + .ndo_open = arc_emac_open, + .ndo_stop = arc_emac_stop, + .ndo_start_xmit = arc_emac_tx, + .ndo_set_mac_address = arc_emac_set_address, + .ndo_get_stats = arc_emac_stats, +}; + +static int arc_emac_probe(struct platform_device *pdev) +{ + struct resource res_regs, res_irq; + struct device_node *phy_node; + struct arc_emac_priv *priv; + struct net_device *ndev; + const char *mac_addr; + unsigned int id, clock_frequency; + int err; + + if (!pdev->dev.of_node) + return -ENODEV; + + /* Get PHY from device tree */ + phy_node = of_parse_phandle(pdev->dev.of_node, "phy", 0); + if (!phy_node) { + dev_err(&pdev->dev, "failed to retrieve phy description from device tree\n"); + return -ENODEV; + } + + /* Get EMAC registers base address from device tree */ + err = of_address_to_resource(pdev->dev.of_node, 0, &res_regs); + if (err) { + dev_err(&pdev->dev, "failed to retrieve registers base from device tree\n"); + return -ENODEV; + } + + /* Get CPU clock frequency from device tree */ + if (of_property_read_u32(pdev->dev.of_node, "clock-frequency", + &clock_frequency)) { + dev_err(&pdev->dev, "failed to retrieve <clock-frequency> from device tree\n"); + return -EINVAL; + } + + /* Get IRQ from device tree */ + err = of_irq_to_resource(pdev->dev.of_node, 0, &res_irq); + if (!err) { + dev_err(&pdev->dev, "failed to retrieve <irq> value from device tree\n"); + return -ENODEV; + } + + ndev = alloc_etherdev(sizeof(struct arc_emac_priv)); + if (!ndev) + return -ENOMEM; + + SET_NETDEV_DEV(ndev, &pdev->dev); + + ndev->netdev_ops = &arc_emac_netdev_ops; + ndev->ethtool_ops = &arc_emac_ethtool_ops; + ndev->watchdog_timeo = TX_TIMEOUT; + /* FIXME :: no multicast support yet */ + ndev->flags &= ~IFF_MULTICAST; + + priv = netdev_priv(ndev); + priv->dev = &pdev->dev; + priv->ndev = ndev; + + priv->regs = devm_ioremap_resource(&pdev->dev, &res_regs); + if (IS_ERR(priv->regs)) { + err = PTR_ERR(priv->regs); + goto out; + } + dev_dbg(&pdev->dev, "Registers base address is 0x%p\n", priv->regs); + + id = arc_reg_get(priv, R_ID); + + /* Check for EMAC revision 5 or 7, magic number */ + if (!(id == 0x0005fd02 || id == 0x0007fd02)) { + dev_err(&pdev->dev, "ARC EMAC not detected, id=0x%x\n", id); + err = -ENODEV; + goto out; + } + dev_info(&pdev->dev, "ARC EMAC detected with id: 0x%x\n", id); + + /* Set poll rate so that it polls every 1 ms */ + arc_reg_set(priv, R_POLLRATE, clock_frequency / 1000000); + + /* Get max speed of operation from device tree */ + if (of_property_read_u32(pdev->dev.of_node, "max-speed", + &priv->max_speed)) { + dev_err(&pdev->dev, "failed to retrieve <max-speed> from device tree\n"); + err = -EINVAL; + goto out; + } + + ndev->irq = res_irq.start; + dev_info(&pdev->dev, "IRQ is %d\n", ndev->irq); + + /* Register interrupt handler for device */ + err = devm_request_irq(&pdev->dev, ndev->irq, arc_emac_intr, 0, + ndev->name, ndev); + if (err) { + dev_err(&pdev->dev, "could not allocate IRQ\n"); + goto out; + } + + /* Get MAC address from device tree */ + mac_addr = of_get_mac_address(pdev->dev.of_node); + + if (!mac_addr || !is_valid_ether_addr(mac_addr)) + eth_hw_addr_random(ndev); + else + memcpy(ndev->dev_addr, mac_addr, ETH_ALEN); + + dev_info(&pdev->dev, "MAC address is now %pM\n", ndev->dev_addr); + + /* Do 1 allocation instead of 2 separate ones for Rx and Tx BD rings */ + priv->rxbd = dmam_alloc_coherent(&pdev->dev, RX_RING_SZ + TX_RING_SZ, + &priv->rxbd_dma, GFP_KERNEL); + + if (!priv->rxbd) { + dev_err(&pdev->dev, "failed to allocate data buffers\n"); + err = -ENOMEM; + goto out; + } + + priv->txbd = priv->rxbd + RX_BD_NUM; + + priv->txbd_dma = priv->rxbd_dma + RX_RING_SZ; + dev_dbg(&pdev->dev, "EMAC Device addr: Rx Ring [0x%x], Tx Ring[%x]\n", + (unsigned int)priv->rxbd_dma, (unsigned int)priv->txbd_dma); + + err = arc_mdio_probe(pdev, priv); + if (err) { + dev_err(&pdev->dev, "failed to probe MII bus\n"); + goto out; + } + + priv->phy_dev = of_phy_connect(ndev, phy_node, arc_emac_adjust_link, 0, + PHY_INTERFACE_MODE_MII); + if (!priv->phy_dev) { + dev_err(&pdev->dev, "of_phy_connect() failed\n"); + err = -ENODEV; + goto out; + } + + dev_info(&pdev->dev, "connected to %s phy with id 0x%x\n", + priv->phy_dev->drv->name, priv->phy_dev->phy_id); + + netif_napi_add(ndev, &priv->napi, arc_emac_poll, ARC_EMAC_NAPI_WEIGHT); + + err = register_netdev(ndev); + if (err) { + netif_napi_del(&priv->napi); + dev_err(&pdev->dev, "failed to register network device\n"); + goto out; + } + + return 0; + +out: + free_netdev(ndev); + return err; +} + +static int arc_emac_remove(struct platform_device *pdev) +{ + struct net_device *ndev = platform_get_drvdata(pdev); + struct arc_emac_priv *priv = netdev_priv(ndev); + + phy_disconnect(priv->phy_dev); + priv->phy_dev = NULL; + arc_mdio_remove(priv); + unregister_netdev(ndev); + netif_napi_del(&priv->napi); + free_netdev(ndev); + + return 0; +} + +static const struct of_device_id arc_emac_dt_ids[] = { + { .compatible = "snps,arc-emac" }, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(of, arc_emac_dt_ids); + +static struct platform_driver arc_emac_driver = { + .probe = arc_emac_probe, + .remove = arc_emac_remove, + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + .of_match_table = arc_emac_dt_ids, + }, +}; + +module_platform_driver(arc_emac_driver); + +MODULE_AUTHOR("Alexey Brodkin <abrodkin@synopsys.com>"); +MODULE_DESCRIPTION("ARC EMAC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/arc/emac_mdio.c b/drivers/net/ethernet/arc/emac_mdio.c new file mode 100644 index 00000000000..26ba2423f33 --- /dev/null +++ b/drivers/net/ethernet/arc/emac_mdio.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2004-2013 Synopsys, Inc. (www.synopsys.com) + * + * MDIO implementation for ARC EMAC + */ + +#include <linux/delay.h> +#include <linux/of_mdio.h> +#include <linux/platform_device.h> + +#include "emac.h" + +/* Number of seconds we wait for "MDIO complete" flag to appear */ +#define ARC_MDIO_COMPLETE_POLL_COUNT 1 + +/** + * arc_mdio_complete_wait - Waits until MDIO transaction is completed. + * @priv: Pointer to ARC EMAC private data structure. + * + * returns: 0 on success, -ETIMEDOUT on a timeout. + */ +static int arc_mdio_complete_wait(struct arc_emac_priv *priv) +{ + unsigned int i; + + for (i = 0; i < ARC_MDIO_COMPLETE_POLL_COUNT * 40; i++) { + unsigned int status = arc_reg_get(priv, R_STATUS); + + status &= MDIO_MASK; + + if (status) { + /* Reset "MDIO complete" flag */ + arc_reg_set(priv, R_STATUS, status); + return 0; + } + + msleep(25); + } + + return -ETIMEDOUT; +} + +/** + * arc_mdio_read - MDIO interface read function. + * @bus: Pointer to MII bus structure. + * @phy_addr: Address of the PHY device. + * @reg_num: PHY register to read. + * + * returns: The register contents on success, -ETIMEDOUT on a timeout. + * + * Reads the contents of the requested register from the requested PHY + * address. + */ +static int arc_mdio_read(struct mii_bus *bus, int phy_addr, int reg_num) +{ + struct arc_emac_priv *priv = bus->priv; + unsigned int value; + int error; + + arc_reg_set(priv, R_MDIO, + 0x60020000 | (phy_addr << 23) | (reg_num << 18)); + + error = arc_mdio_complete_wait(priv); + if (error < 0) + return error; + + value = arc_reg_get(priv, R_MDIO) & 0xffff; + + dev_dbg(priv->dev, "arc_mdio_read(phy_addr=%i, reg_num=%x) = %x\n", + phy_addr, reg_num, value); + + return value; +} + +/** + * arc_mdio_write - MDIO interface write function. + * @bus: Pointer to MII bus structure. + * @phy_addr: Address of the PHY device. + * @reg_num: PHY register to write to. + * @value: Value to be written into the register. + * + * returns: 0 on success, -ETIMEDOUT on a timeout. + * + * Writes the value to the requested register. + */ +static int arc_mdio_write(struct mii_bus *bus, int phy_addr, + int reg_num, u16 value) +{ + struct arc_emac_priv *priv = bus->priv; + + dev_dbg(priv->dev, + "arc_mdio_write(phy_addr=%i, reg_num=%x, value=%x)\n", + phy_addr, reg_num, value); + + arc_reg_set(priv, R_MDIO, + 0x50020000 | (phy_addr << 23) | (reg_num << 18) | value); + + return arc_mdio_complete_wait(priv); +} + +/** + * arc_mdio_probe - MDIO probe function. + * @pdev: Pointer to platform device. + * @priv: Pointer to ARC EMAC private data structure. + * + * returns: 0 on success, -ENOMEM when mdiobus_alloc + * (to allocate memory for MII bus structure) fails. + * + * Sets up and registers the MDIO interface. + */ +int arc_mdio_probe(struct platform_device *pdev, struct arc_emac_priv *priv) +{ + struct mii_bus *bus; + int error; + + bus = mdiobus_alloc(); + if (!bus) + return -ENOMEM; + + priv->bus = bus; + bus->priv = priv; + bus->parent = priv->dev; + bus->name = "Synopsys MII Bus", + bus->read = &arc_mdio_read; + bus->write = &arc_mdio_write; + + snprintf(bus->id, MII_BUS_ID_SIZE, "%s", pdev->name); + + error = of_mdiobus_register(bus, pdev->dev.of_node); + if (error) { + dev_err(priv->dev, "cannot register MDIO bus %s\n", bus->name); + mdiobus_free(bus); + return error; + } + + return 0; +} + +/** + * arc_mdio_remove - MDIO remove function. + * @priv: Pointer to ARC EMAC private data structure. + * + * Unregisters the MDIO and frees any associate memory for MII bus. + */ +int arc_mdio_remove(struct arc_emac_priv *priv) +{ + mdiobus_unregister(priv->bus); + mdiobus_free(priv->bus); + priv->bus = NULL; + + return 0; +} diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index d342c5a34af..ec3aa1d451e 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1722,7 +1722,7 @@ static int bnx2x_req_irq(struct bnx2x *bp) return request_irq(irq, bnx2x_interrupt, flags, bp->dev->name, bp->dev); } -int bnx2x_setup_irqs(struct bnx2x *bp) +static int bnx2x_setup_irqs(struct bnx2x *bp) { int rc = 0; if (bp->flags & USING_MSIX_FLAG && @@ -3543,9 +3543,12 @@ static void bnx2x_update_pbds_gso_enc(struct sk_buff *skb, /* outer IP header info */ if (xmit_type & XMIT_CSUM_V4) { struct iphdr *iph = ip_hdr(skb); + u16 csum = (__force u16)(~iph->check) - + (__force u16)iph->tot_len - + (__force u16)iph->frag_off; + pbd2->fw_ip_csum_wo_len_flags_frag = - bswab16(csum_fold((~iph->check) - - iph->tot_len - iph->frag_off)); + bswab16(csum_fold((__force __wsum)csum)); } else { pbd2->fw_ip_hdr_to_payload_w = hlen_w - ((sizeof(struct ipv6hdr)) >> 1); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 7c6faebb183..b8c067d1a0f 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -1391,7 +1391,7 @@ static bool bnx2x_is_nvm_accessible(struct bnx2x *bp) bp->pm_cap + PCI_PM_CTRL, &pm); if ((rc && !netif_running(dev)) || - (!rc && ((pm & PCI_PM_CTRL_STATE_MASK) != PCI_D0))) + (!rc && ((pm & PCI_PM_CTRL_STATE_MASK) != (__force u16)PCI_D0))) return false; return true; @@ -1610,8 +1610,10 @@ static int bnx2x_nvram_write1(struct bnx2x *bp, u32 offset, u8 *data_buf, */ val = be32_to_cpu(val_be); - val &= ~le32_to_cpu(0xff << BYTE_OFFSET(offset)); - val |= le32_to_cpu(*data_buf << BYTE_OFFSET(offset)); + val &= ~le32_to_cpu((__force __le32) + (0xff << BYTE_OFFSET(offset))); + val |= le32_to_cpu((__force __le32) + (*data_buf << BYTE_OFFSET(offset))); rc = bnx2x_nvram_write_dword(bp, align_offset, val, cmd_flags); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 73189888766..740518bbcb5 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -615,7 +615,7 @@ void bnx2x_read_dmae(struct bnx2x *bp, u32 src_addr, u32 len32) if (rc) { BNX2X_ERR("DMAE returned failure %d\n", rc); bnx2x_panic(); - }; + } } static void bnx2x_write_dmae_phys_len(struct bnx2x *bp, dma_addr_t phys_addr, diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 64646eb39e8..270e65f2110 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -483,7 +483,6 @@ static void korina_multicast_list(struct net_device *dev) unsigned long flags; struct netdev_hw_addr *ha; u32 recognise = ETH_ARC_AB; /* always accept broadcasts */ - int i; /* Set promiscuous mode */ if (dev->flags & IFF_PROMISC) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index ea1e0389635..df04c8206eb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -257,6 +257,8 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "communication channel command 0x%x timed out\n", + op); err = -EBUSY; goto out; } @@ -486,6 +488,8 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, } if (cmd_pending(dev)) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); err = -ETIMEDOUT; goto out; } @@ -549,6 +553,8 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { + mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", + op); err = -EBUSY; goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c index 0f91222ea3d..9d4a1ea030d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_dcb_nl.c @@ -207,9 +207,6 @@ static int mlx4_en_dcbnl_ieee_getmaxrate(struct net_device *dev, struct mlx4_en_priv *priv = netdev_priv(dev); int i; - if (!priv->maxrate) - return -EINVAL; - for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) maxrate->tc_maxrate[i] = priv->maxrate[i] * MLX4_RATELIMIT_UNITS_IN_KB; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index a5c9df07a7d..a071cda2dd0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -310,7 +310,7 @@ static void *mlx4_en_add(struct mlx4_dev *dev) err_mr: (void) mlx4_mr_free(dev, &mdev->mr); err_map: - if (!mdev->uar_map) + if (mdev->uar_map) iounmap(mdev->uar_map); err_uar: mlx4_uar_free(dev, &mdev->priv_uar); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 7299ada876c..caf20477056 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -405,7 +405,7 @@ static int mlx4_en_vlan_rx_add_vid(struct net_device *dev, en_err(priv, "Failed configuring VLAN filter\n"); } if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx)) - en_err(priv, "failed adding vlan %d\n", vid); + en_dbg(HW, priv, "failed adding vlan %d\n", vid); mutex_unlock(&mdev->state_lock); return 0; @@ -428,7 +428,7 @@ static int mlx4_en_vlan_rx_kill_vid(struct net_device *dev, if (!mlx4_find_cached_vlan(mdev->dev, priv->port, vid, &idx)) mlx4_unregister_vlan(mdev->dev, priv->port, idx); else - en_err(priv, "could not find vid %d in cache\n", vid); + en_dbg(HW, priv, "could not find vid %d in cache\n", vid); if (mdev->device_up && priv->port_up) { err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); @@ -1236,10 +1236,19 @@ static void mlx4_en_tx_timeout(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; + int i; if (netif_msg_timer(priv)) en_warn(priv, "Tx timeout called on port:%d\n", priv->port); + for (i = 0; i < priv->tx_ring_num; i++) { + if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, i))) + continue; + en_warn(priv, "TX timeout on queue: %d, QP: 0x%x, CQ: 0x%x, Cons: 0x%x, Prod: 0x%x\n", + i, priv->tx_ring[i].qpn, priv->tx_ring[i].cqn, + priv->tx_ring[i].cons, priv->tx_ring[i].prod); + } + priv->port_stats.tx_timeout++; en_dbg(DRV, priv, "Scheduling watchdog\n"); queue_work(mdev->workqueue, &priv->watchdog_task); @@ -1375,12 +1384,13 @@ static void mlx4_en_do_get_stats(struct work_struct *work) mutex_lock(&mdev->state_lock); if (mdev->device_up) { - err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); - if (err) - en_dbg(HW, priv, "Could not update stats\n"); + if (priv->port_up) { + err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); + if (err) + en_dbg(HW, priv, "Could not update stats\n"); - if (priv->port_up) mlx4_en_auto_moderation(priv); + } queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); } @@ -1634,6 +1644,9 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) return; } + /* close port*/ + mlx4_CLOSE_PORT(mdev->dev, priv->port); + /* Synchronize with tx routine */ netif_tx_lock_bh(dev); if (detach) @@ -1734,14 +1747,11 @@ void mlx4_en_stop_port(struct net_device *dev, int detach) } local_bh_enable(); - mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); while (test_bit(NAPI_STATE_SCHED, &cq->napi.state)) msleep(1); + mlx4_en_deactivate_rx_ring(priv, &priv->rx_ring[i]); mlx4_en_deactivate_cq(priv, cq); } - - /* close port*/ - mlx4_CLOSE_PORT(mdev->dev, priv->port); } static void mlx4_en_restart(struct work_struct *work) @@ -2322,6 +2332,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, mdev->pndev[port] = dev; netif_carrier_off(dev); + mlx4_en_set_default_moderation(priv); + err = register_netdev(dev); if (err) { en_err(priv, "Netdev registration failed for port %d\n", port); @@ -2353,7 +2365,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, en_err(priv, "Failed Initializing port\n"); goto out; } - mlx4_en_set_default_moderation(priv); queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_TS) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 9c57581b021..76997b93fdf 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -43,40 +43,64 @@ #include "mlx4_en.h" +static int mlx4_alloc_pages(struct mlx4_en_priv *priv, + struct mlx4_en_rx_alloc *page_alloc, + const struct mlx4_en_frag_info *frag_info, + gfp_t _gfp) +{ + int order; + struct page *page; + dma_addr_t dma; + + for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) { + gfp_t gfp = _gfp; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN; + page = alloc_pages(gfp, order); + if (likely(page)) + break; + if (--order < 0 || + ((PAGE_SIZE << order) < frag_info->frag_size)) + return -ENOMEM; + } + dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order, + PCI_DMA_FROMDEVICE); + if (dma_mapping_error(priv->ddev, dma)) { + put_page(page); + return -ENOMEM; + } + page_alloc->size = PAGE_SIZE << order; + page_alloc->page = page; + page_alloc->dma = dma; + page_alloc->offset = frag_info->frag_align; + /* Not doing get_page() for each frag is a big win + * on asymetric workloads. + */ + atomic_set(&page->_count, page_alloc->size / frag_info->frag_stride); + return 0; +} + static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_alloc *frags, - struct mlx4_en_rx_alloc *ring_alloc) + struct mlx4_en_rx_alloc *ring_alloc, + gfp_t gfp) { struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; - struct mlx4_en_frag_info *frag_info; + const struct mlx4_en_frag_info *frag_info; struct page *page; dma_addr_t dma; int i; for (i = 0; i < priv->num_frags; i++) { frag_info = &priv->frag_info[i]; - if (ring_alloc[i].offset == frag_info->last_offset) { - page = alloc_pages(GFP_ATOMIC | __GFP_COMP, - MLX4_EN_ALLOC_ORDER); - if (!page) - goto out; - dma = dma_map_page(priv->ddev, page, 0, - MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE); - if (dma_mapping_error(priv->ddev, dma)) { - put_page(page); - goto out; - } - page_alloc[i].page = page; - page_alloc[i].dma = dma; - page_alloc[i].offset = frag_info->frag_align; - } else { - page_alloc[i].page = ring_alloc[i].page; - get_page(ring_alloc[i].page); - page_alloc[i].dma = ring_alloc[i].dma; - page_alloc[i].offset = ring_alloc[i].offset + - frag_info->frag_stride; - } + page_alloc[i] = ring_alloc[i]; + page_alloc[i].offset += frag_info->frag_stride; + if (page_alloc[i].offset + frag_info->frag_stride <= ring_alloc[i].size) + continue; + if (mlx4_alloc_pages(priv, &page_alloc[i], frag_info, gfp)) + goto out; } for (i = 0; i < priv->num_frags; i++) { @@ -88,14 +112,16 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv, return 0; - out: while (i--) { frag_info = &priv->frag_info[i]; - if (ring_alloc[i].offset == frag_info->last_offset) + if (page_alloc[i].page != ring_alloc[i].page) { dma_unmap_page(priv->ddev, page_alloc[i].dma, - MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE); - put_page(page_alloc[i].page); + page_alloc[i].size, PCI_DMA_FROMDEVICE); + page = page_alloc[i].page; + atomic_set(&page->_count, 1); + put_page(page); + } } return -ENOMEM; } @@ -104,12 +130,12 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv, struct mlx4_en_rx_alloc *frags, int i) { - struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; - if (frags[i].offset == frag_info->last_offset) { - dma_unmap_page(priv->ddev, frags[i].dma, MLX4_EN_ALLOC_SIZE, + if (frags[i].offset + frag_info->frag_stride > frags[i].size) + dma_unmap_page(priv->ddev, frags[i].dma, frags[i].size, PCI_DMA_FROMDEVICE); - } + if (frags[i].page) put_page(frags[i].page); } @@ -117,35 +143,28 @@ static void mlx4_en_free_frag(struct mlx4_en_priv *priv, static int mlx4_en_init_allocator(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring) { - struct mlx4_en_rx_alloc *page_alloc; int i; + struct mlx4_en_rx_alloc *page_alloc; for (i = 0; i < priv->num_frags; i++) { - page_alloc = &ring->page_alloc[i]; - page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, - MLX4_EN_ALLOC_ORDER); - if (!page_alloc->page) - goto out; + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; - page_alloc->dma = dma_map_page(priv->ddev, page_alloc->page, 0, - MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE); - if (dma_mapping_error(priv->ddev, page_alloc->dma)) { - put_page(page_alloc->page); - page_alloc->page = NULL; + if (mlx4_alloc_pages(priv, &ring->page_alloc[i], + frag_info, GFP_KERNEL)) goto out; - } - page_alloc->offset = priv->frag_info[i].frag_align; - en_dbg(DRV, priv, "Initialized allocator:%d with page:%p\n", - i, page_alloc->page); } return 0; out: while (i--) { + struct page *page; + page_alloc = &ring->page_alloc[i]; dma_unmap_page(priv->ddev, page_alloc->dma, - MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE); - put_page(page_alloc->page); + page_alloc->size, PCI_DMA_FROMDEVICE); + page = page_alloc->page; + atomic_set(&page->_count, 1); + put_page(page); page_alloc->page = NULL; } return -ENOMEM; @@ -158,13 +177,18 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, int i; for (i = 0; i < priv->num_frags; i++) { + const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i]; + page_alloc = &ring->page_alloc[i]; en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n", i, page_count(page_alloc->page)); dma_unmap_page(priv->ddev, page_alloc->dma, - MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE); - put_page(page_alloc->page); + page_alloc->size, PCI_DMA_FROMDEVICE); + while (page_alloc->offset + frag_info->frag_stride < page_alloc->size) { + put_page(page_alloc->page); + page_alloc->offset += frag_info->frag_stride; + } page_alloc->page = NULL; } } @@ -195,13 +219,14 @@ static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, } static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *ring, int index) + struct mlx4_en_rx_ring *ring, int index, + gfp_t gfp) { struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); struct mlx4_en_rx_alloc *frags = ring->rx_info + (index << priv->log_rx_info); - return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc); + return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp); } static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring) @@ -235,7 +260,8 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) ring = &priv->rx_ring[ring_ind]; if (mlx4_en_prepare_rx_desc(priv, ring, - ring->actual_size)) { + ring->actual_size, + GFP_KERNEL)) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate " "enough rx buffers\n"); @@ -450,11 +476,11 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, DMA_FROM_DEVICE); /* Save page reference in skb */ - get_page(frags[nr].page); __skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page); skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size); skb_frags_rx[nr].page_offset = frags[nr].offset; skb->truesize += frag_info->frag_stride; + frags[nr].page = NULL; } /* Adjust size of last fragment to match actual length */ if (nr > 0) @@ -547,7 +573,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, int index = ring->prod & ring->size_mask; while ((u32) (ring->prod - ring->cons) < ring->actual_size) { - if (mlx4_en_prepare_rx_desc(priv, ring, index)) + if (mlx4_en_prepare_rx_desc(priv, ring, index, GFP_ATOMIC)) break; ring->prod++; index = ring->prod & ring->size_mask; @@ -805,21 +831,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) return done; } - -/* Calculate the last offset position that accommodates a full fragment - * (assuming fagment size = stride-align) */ -static int mlx4_en_last_alloc_offset(struct mlx4_en_priv *priv, u16 stride, u16 align) -{ - u16 res = MLX4_EN_ALLOC_SIZE % stride; - u16 offset = MLX4_EN_ALLOC_SIZE - stride - res + align; - - en_dbg(DRV, priv, "Calculated last offset for stride:%d align:%d " - "res:%d offset:%d\n", stride, align, res, offset); - return offset; -} - - -static int frag_sizes[] = { +static const int frag_sizes[] = { FRAG_SZ0, FRAG_SZ1, FRAG_SZ2, @@ -847,9 +859,6 @@ void mlx4_en_calc_rx_buf(struct net_device *dev) priv->frag_info[i].frag_stride = ALIGN(frag_sizes[i], SMP_CACHE_BYTES); } - priv->frag_info[i].last_offset = mlx4_en_last_alloc_offset( - priv, priv->frag_info[i].frag_stride, - priv->frag_info[i].frag_align); buf_size += priv->frag_info[i].frag_size; i++; } @@ -861,13 +870,13 @@ void mlx4_en_calc_rx_buf(struct net_device *dev) en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d " "num_frags:%d):\n", eff_mtu, priv->num_frags); for (i = 0; i < priv->num_frags; i++) { - en_dbg(DRV, priv, " frag:%d - size:%d prefix:%d align:%d " - "stride:%d last_offset:%d\n", i, - priv->frag_info[i].frag_size, - priv->frag_info[i].frag_prefix_size, - priv->frag_info[i].frag_align, - priv->frag_info[i].frag_stride, - priv->frag_info[i].last_offset); + en_err(priv, + " frag:%d - size:%d prefix:%d align:%d stride:%d\n", + i, + priv->frag_info[i].frag_size, + priv->frag_info[i].frag_prefix_size, + priv->frag_info[i].frag_align, + priv->frag_info[i].frag_stride); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2f4a26039e8..56160a2bb57 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -839,11 +839,11 @@ static ssize_t set_port_ib_mtu(struct device *dev, return -EINVAL; } - err = sscanf(buf, "%d", &mtu); - if (err > 0) + err = kstrtoint(buf, 0, &mtu); + if (!err) ibta_mtu = int_to_ibta_mtu(mtu); - if (err <= 0 || ibta_mtu < 0) { + if (err || ibta_mtu < 0) { mlx4_err(mdev, "%s is invalid IBTA mtu\n", buf); return -EINVAL; } @@ -2077,6 +2077,11 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data) num_vfs, MLX4_MAX_NUM_VF); return -EINVAL; } + + if (num_vfs < 0) { + pr_err("num_vfs module parameter cannot be negative\n"); + return -EINVAL; + } /* * Check for BARs. */ diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 57192a8f1d5..35fb60e2320 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -96,7 +96,8 @@ /* Use the maximum between 16384 and a single page */ #define MLX4_EN_ALLOC_SIZE PAGE_ALIGN(16384) -#define MLX4_EN_ALLOC_ORDER get_order(MLX4_EN_ALLOC_SIZE) + +#define MLX4_EN_ALLOC_PREFER_ORDER PAGE_ALLOC_COSTLY_ORDER /* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU * and 4K allocations) */ @@ -234,9 +235,10 @@ struct mlx4_en_tx_desc { #define MLX4_EN_CX3_HIGH_ID 0x1005 struct mlx4_en_rx_alloc { - struct page *page; - dma_addr_t dma; - u16 offset; + struct page *page; + dma_addr_t dma; + u32 offset; + u32 size; }; struct mlx4_en_tx_ring { @@ -439,8 +441,6 @@ struct mlx4_en_frag_info { u16 frag_prefix_size; u16 frag_stride; u16 frag_align; - u16 last_offset; - }; #ifdef CONFIG_MLX4_EN_DCB diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 46cc11d5e20..e7284a2caff 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -21,8 +21,8 @@ #include <linux/ethtool.h> #include <linux/topology.h> #include <linux/gfp.h> -#include <linux/cpu_rmap.h> #include <linux/aer.h> +#include <linux/interrupt.h> #include "net_driver.h" #include "efx.h" #include "nic.h" @@ -1283,29 +1283,6 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx) return count; } -static int -efx_init_rx_cpu_rmap(struct efx_nic *efx, struct msix_entry *xentries) -{ -#ifdef CONFIG_RFS_ACCEL - unsigned int i; - int rc; - - efx->net_dev->rx_cpu_rmap = alloc_irq_cpu_rmap(efx->n_rx_channels); - if (!efx->net_dev->rx_cpu_rmap) - return -ENOMEM; - for (i = 0; i < efx->n_rx_channels; i++) { - rc = irq_cpu_rmap_add(efx->net_dev->rx_cpu_rmap, - xentries[i].vector); - if (rc) { - free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap); - efx->net_dev->rx_cpu_rmap = NULL; - return rc; - } - } -#endif - return 0; -} - /* Probe the number and type of interrupts we are able to obtain, and * the resulting numbers of channels and RX queues. */ @@ -1359,11 +1336,6 @@ static int efx_probe_interrupts(struct efx_nic *efx) efx->n_tx_channels = n_channels; efx->n_rx_channels = n_channels; } - rc = efx_init_rx_cpu_rmap(efx, xentries); - if (rc) { - pci_disable_msix(efx->pci_dev); - return rc; - } for (i = 0; i < efx->n_channels; i++) efx_get_channel(efx, i)->irq = xentries[i].vector; @@ -1427,6 +1399,10 @@ static void efx_start_interrupts(struct efx_nic *efx, bool may_keep_eventq) BUG_ON(efx->state == STATE_DISABLED); + if (efx->eeh_disabled_legacy_irq) { + enable_irq(efx->legacy_irq); + efx->eeh_disabled_legacy_irq = false; + } if (efx->legacy_irq) efx->legacy_irq_enabled = true; efx_nic_enable_interrupts(efx); @@ -2365,7 +2341,7 @@ out: * Returns 0 if the recovery mechanisms are unsuccessful. * Returns a non-zero value otherwise. */ -static int efx_try_recovery(struct efx_nic *efx) +int efx_try_recovery(struct efx_nic *efx) { #ifdef CONFIG_EEH /* A PCI error can occur and not be seen by EEH because nothing @@ -2603,10 +2579,6 @@ static void efx_pci_remove_main(struct efx_nic *efx) BUG_ON(efx->state == STATE_READY); cancel_work_sync(&efx->reset_work); -#ifdef CONFIG_RFS_ACCEL - free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap); - efx->net_dev->rx_cpu_rmap = NULL; -#endif efx_stop_interrupts(efx, false); efx_nic_fini_interrupt(efx); efx_fini_port(efx); diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h index 8372da239b4..bdb30bbb0c9 100644 --- a/drivers/net/ethernet/sfc/efx.h +++ b/drivers/net/ethernet/sfc/efx.h @@ -124,6 +124,7 @@ extern const struct ethtool_ops efx_ethtool_ops; extern int efx_reset(struct efx_nic *efx, enum reset_type method); extern void efx_reset_down(struct efx_nic *efx, enum reset_type method); extern int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok); +extern int efx_try_recovery(struct efx_nic *efx); /* Global */ extern void efx_schedule_reset(struct efx_nic *efx, enum reset_type type); diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 6e768175e7e..1fc21458413 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -1114,6 +1114,20 @@ static int efx_ethtool_set_rxfh_indir(struct net_device *net_dev, return 0; } +int efx_ethtool_get_ts_info(struct net_device *net_dev, + struct ethtool_ts_info *ts_info) +{ + struct efx_nic *efx = netdev_priv(net_dev); + + /* Software capabilities */ + ts_info->so_timestamping = (SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE); + ts_info->phc_index = -1; + + efx_ptp_get_ts_info(efx, ts_info); + return 0; +} + static int efx_ethtool_get_module_eeprom(struct net_device *net_dev, struct ethtool_eeprom *ee, u8 *data) @@ -1176,7 +1190,7 @@ const struct ethtool_ops efx_ethtool_ops = { .get_rxfh_indir_size = efx_ethtool_get_rxfh_indir_size, .get_rxfh_indir = efx_ethtool_get_rxfh_indir, .set_rxfh_indir = efx_ethtool_set_rxfh_indir, - .get_ts_info = efx_ptp_get_ts_info, + .get_ts_info = efx_ethtool_get_ts_info, .get_module_info = efx_ethtool_get_module_info, .get_module_eeprom = efx_ethtool_get_module_eeprom, }; diff --git a/drivers/net/ethernet/sfc/filter.c b/drivers/net/ethernet/sfc/filter.c index 2397f0e8d3e..b74a60ab9ac 100644 --- a/drivers/net/ethernet/sfc/filter.c +++ b/drivers/net/ethernet/sfc/filter.c @@ -1185,8 +1185,21 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb, nhoff = skb_network_offset(skb); - if (skb->protocol != htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_8021Q)) { + EFX_BUG_ON_PARANOID(skb_headlen(skb) < + nhoff + sizeof(struct vlan_hdr)); + if (((const struct vlan_hdr *)skb->data + nhoff)-> + h_vlan_encapsulated_proto != htons(ETH_P_IP)) + return -EPROTONOSUPPORT; + + /* This is IP over 802.1q VLAN. We can't filter on the + * IP 5-tuple and the vlan together, so just strip the + * vlan header and filter on the IP part. + */ + nhoff += sizeof(struct vlan_hdr); + } else if (skb->protocol != htons(ETH_P_IP)) { return -EPROTONOSUPPORT; + } /* RFS must validate the IP header length before calling us */ EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + sizeof(*ip)); diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h index 9a2914cfd34..f4c7e6b6774 100644 --- a/drivers/net/ethernet/sfc/net_driver.h +++ b/drivers/net/ethernet/sfc/net_driver.h @@ -243,6 +243,7 @@ struct efx_rx_buffer { #define EFX_RX_BUF_LAST_IN_PAGE 0x0001 #define EFX_RX_PKT_CSUMMED 0x0002 #define EFX_RX_PKT_DISCARD 0x0004 +#define EFX_RX_PKT_TCP 0x0040 /** * struct efx_rx_page_state - Page-based rx buffer state @@ -788,6 +789,7 @@ struct efx_nic { const struct efx_nic_type *type; int legacy_irq; bool legacy_irq_enabled; + bool eeh_disabled_legacy_irq; struct workqueue_struct *workqueue; char workqueue_name[16]; struct work_struct reset_work; diff --git a/drivers/net/ethernet/sfc/nic.c b/drivers/net/ethernet/sfc/nic.c index b0503cd8c2a..56ed3bc71e0 100644 --- a/drivers/net/ethernet/sfc/nic.c +++ b/drivers/net/ethernet/sfc/nic.c @@ -14,6 +14,7 @@ #include <linux/pci.h> #include <linux/module.h> #include <linux/seq_file.h> +#include <linux/cpu_rmap.h> #include "net_driver.h" #include "bitfield.h" #include "efx.h" @@ -1080,12 +1081,21 @@ efx_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event) rx_ev_hdr_type = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_HDR_TYPE); if (likely(rx_ev_pkt_ok)) { - /* If packet is marked as OK and packet type is TCP/IP or - * UDP/IP, then we can rely on the hardware checksum. + /* If packet is marked as OK then we can rely on the + * hardware checksum and classification. */ - flags = (rx_ev_hdr_type == FSE_CZ_RX_EV_HDR_TYPE_IPV4V6_TCP || - rx_ev_hdr_type == FSE_CZ_RX_EV_HDR_TYPE_IPV4V6_UDP) ? - EFX_RX_PKT_CSUMMED : 0; + flags = 0; + switch (rx_ev_hdr_type) { + case FSE_CZ_RX_EV_HDR_TYPE_IPV4V6_TCP: + flags |= EFX_RX_PKT_TCP; + /* fall through */ + case FSE_CZ_RX_EV_HDR_TYPE_IPV4V6_UDP: + flags |= EFX_RX_PKT_CSUMMED; + /* fall through */ + case FSE_CZ_RX_EV_HDR_TYPE_IPV4V6_OTHER: + case FSE_AZ_RX_EV_HDR_TYPE_OTHER: + break; + } } else { flags = efx_handle_rx_not_ok(rx_queue, event); } @@ -1579,6 +1589,16 @@ static irqreturn_t efx_legacy_interrupt(int irq, void *dev_id) efx_readd(efx, ®, FR_BZ_INT_ISR0); queues = EFX_EXTRACT_DWORD(reg, 0, 31); + /* Legacy interrupts are disabled too late by the EEH kernel + * code. Disable them earlier. + * If an EEH error occurred, the read will have returned all ones. + */ + if (EFX_DWORD_IS_ALL_ONES(reg) && efx_try_recovery(efx) && + !efx->eeh_disabled_legacy_irq) { + disable_irq_nosync(efx->legacy_irq); + efx->eeh_disabled_legacy_irq = true; + } + /* Handle non-event-queue sources */ if (queues & (1U << efx->irq_level)) { syserr = EFX_OWORD_FIELD(*int_ker, FSF_AZ_NET_IVEC_FATAL_INT); @@ -1687,6 +1707,7 @@ void efx_nic_push_rx_indir_table(struct efx_nic *efx) int efx_nic_init_interrupt(struct efx_nic *efx) { struct efx_channel *channel; + unsigned int n_irqs; int rc; if (!EFX_INT_MODE_USE_MSI(efx)) { @@ -1707,7 +1728,19 @@ int efx_nic_init_interrupt(struct efx_nic *efx) return 0; } +#ifdef CONFIG_RFS_ACCEL + if (efx->interrupt_mode == EFX_INT_MODE_MSIX) { + efx->net_dev->rx_cpu_rmap = + alloc_irq_cpu_rmap(efx->n_rx_channels); + if (!efx->net_dev->rx_cpu_rmap) { + rc = -ENOMEM; + goto fail1; + } + } +#endif + /* Hook MSI or MSI-X interrupt */ + n_irqs = 0; efx_for_each_channel(channel, efx) { rc = request_irq(channel->irq, efx_msi_interrupt, IRQF_PROBE_SHARED, /* Not shared */ @@ -1718,13 +1751,31 @@ int efx_nic_init_interrupt(struct efx_nic *efx) "failed to hook IRQ %d\n", channel->irq); goto fail2; } + ++n_irqs; + +#ifdef CONFIG_RFS_ACCEL + if (efx->interrupt_mode == EFX_INT_MODE_MSIX && + channel->channel < efx->n_rx_channels) { + rc = irq_cpu_rmap_add(efx->net_dev->rx_cpu_rmap, + channel->irq); + if (rc) + goto fail2; + } +#endif } return 0; fail2: - efx_for_each_channel(channel, efx) +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap); + efx->net_dev->rx_cpu_rmap = NULL; +#endif + efx_for_each_channel(channel, efx) { + if (n_irqs-- == 0) + break; free_irq(channel->irq, &efx->channel[channel->channel]); + } fail1: return rc; } @@ -1734,11 +1785,14 @@ void efx_nic_fini_interrupt(struct efx_nic *efx) struct efx_channel *channel; efx_oword_t reg; +#ifdef CONFIG_RFS_ACCEL + free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap); + efx->net_dev->rx_cpu_rmap = NULL; +#endif + /* Disable MSI/MSI-X interrupts */ - efx_for_each_channel(channel, efx) { - if (channel->irq) - free_irq(channel->irq, &efx->channel[channel->channel]); - } + efx_for_each_channel(channel, efx) + free_irq(channel->irq, &efx->channel[channel->channel]); /* ACK legacy interrupt */ if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h index 1b000332349..d63c2991a75 100644 --- a/drivers/net/ethernet/sfc/nic.h +++ b/drivers/net/ethernet/sfc/nic.h @@ -254,8 +254,8 @@ extern int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf, struct ethtool_ts_info; extern void efx_ptp_probe(struct efx_nic *efx); extern int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd); -extern int efx_ptp_get_ts_info(struct net_device *net_dev, - struct ethtool_ts_info *ts_info); +extern void efx_ptp_get_ts_info(struct efx_nic *efx, + struct ethtool_ts_info *ts_info); extern bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb); extern int efx_ptp_tx(struct efx_nic *efx, struct sk_buff *skb); extern void efx_ptp_event(struct efx_nic *efx, efx_qword_t *ev); diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c index 9a95abf2ded..b495394a6df 100644 --- a/drivers/net/ethernet/sfc/ptp.c +++ b/drivers/net/ethernet/sfc/ptp.c @@ -1203,18 +1203,16 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init) return 0; } -int -efx_ptp_get_ts_info(struct net_device *net_dev, struct ethtool_ts_info *ts_info) +void efx_ptp_get_ts_info(struct efx_nic *efx, struct ethtool_ts_info *ts_info) { - struct efx_nic *efx = netdev_priv(net_dev); struct efx_ptp_data *ptp = efx->ptp_data; if (!ptp) - return -EOPNOTSUPP; + return; - ts_info->so_timestamping = (SOF_TIMESTAMPING_TX_HARDWARE | - SOF_TIMESTAMPING_RX_HARDWARE | - SOF_TIMESTAMPING_RAW_HARDWARE); + ts_info->so_timestamping |= (SOF_TIMESTAMPING_TX_HARDWARE | + SOF_TIMESTAMPING_RX_HARDWARE | + SOF_TIMESTAMPING_RAW_HARDWARE); ts_info->phc_index = ptp_clock_index(ptp->phc_clock); ts_info->tx_types = 1 << HWTSTAMP_TX_OFF | 1 << HWTSTAMP_TX_ON; ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE | @@ -1224,7 +1222,6 @@ efx_ptp_get_ts_info(struct net_device *net_dev, struct ethtool_ts_info *ts_info) 1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT | 1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC | 1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); - return 0; } int efx_ptp_ioctl(struct efx_nic *efx, struct ifreq *ifr, int cmd) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index a7dfe36cabf..65646cd7af8 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -36,7 +36,7 @@ #define EFX_RECYCLE_RING_SIZE_NOIOMMU (2 * EFX_RX_PREFERRED_BATCH) /* Size of buffer allocated for skb header area. */ -#define EFX_SKB_HEADERS 64u +#define EFX_SKB_HEADERS 128u /* This is the percentage fill level below which new RX descriptors * will be added to the RX descriptor ring. @@ -598,6 +598,8 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 *eh, /* Set the SKB flags */ skb_checksum_none_assert(skb); + if (likely(rx_buf->flags & EFX_RX_PKT_CSUMMED)) + skb->ip_summed = CHECKSUM_UNNECESSARY; if (channel->type->receive_skb) if (channel->type->receive_skb(channel, skb)) @@ -627,7 +629,7 @@ void __efx_rx_packet(struct efx_channel *channel) if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM))) rx_buf->flags &= ~EFX_RX_PKT_CSUMMED; - if (!channel->type->receive_skb) + if ((rx_buf->flags & EFX_RX_PKT_TCP) && !channel->type->receive_skb) efx_rx_packet_gro(channel, rx_buf, channel->rx_pkt_n_frags, eh); else efx_rx_deliver(channel, eh, rx_buf, channel->rx_pkt_n_frags); @@ -675,7 +677,7 @@ static void efx_init_rx_recycle_ring(struct efx_nic *efx, #ifdef CONFIG_PPC64 bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; #else - if (efx->pci_dev->dev.iommu_group) + if (iommu_present(&pci_bus_type)) bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_IOMMU; else bufs_in_recycle_ring = EFX_RECYCLE_RING_SIZE_NOIOMMU; diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index 09b4f8c0b19..0d43fa9ff98 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -995,7 +995,6 @@ static void bigmac_set_multicast(struct net_device *dev) struct bigmac *bp = netdev_priv(dev); void __iomem *bregs = bp->bregs; struct netdev_hw_addr *ha; - int i; u32 tmp, crc; /* Disable the receiver. The bit self-clears when diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index efb6f65a8be..f118d713312 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1532,7 +1532,7 @@ static int emac_dev_open(struct net_device *ndev) struct device *emac_dev = &ndev->dev; u32 cnt; struct resource *res; - int q, m, ret; + int ret; int i = 0; int k = 0; struct emac_priv *priv = netdev_priv(ndev); @@ -1567,8 +1567,9 @@ static int emac_dev_open(struct net_device *ndev) while ((res = platform_get_resource(priv->pdev, IORESOURCE_IRQ, k))) { for (i = res->start; i <= res->end; i++) { - if (request_irq(i, emac_irq, IRQF_DISABLED, - ndev->name, ndev)) + if (devm_request_irq(&priv->pdev->dev, i, emac_irq, + IRQF_DISABLED, + ndev->name, ndev)) goto rollback; } k++; @@ -1641,15 +1642,7 @@ static int emac_dev_open(struct net_device *ndev) rollback: - dev_err(emac_dev, "DaVinci EMAC: request_irq() failed"); - - for (q = k; k >= 0; k--) { - for (m = i; m >= res->start; m--) - free_irq(m, ndev); - res = platform_get_resource(priv->pdev, IORESOURCE_IRQ, k-1); - m = res->end; - } - + dev_err(emac_dev, "DaVinci EMAC: devm_request_irq() failed"); ret = -EBUSY; err: pm_runtime_put(&priv->pdev->dev); @@ -1667,9 +1660,6 @@ err: */ static int emac_dev_stop(struct net_device *ndev) { - struct resource *res; - int i = 0; - int irq_num; struct emac_priv *priv = netdev_priv(ndev); struct device *emac_dev = &ndev->dev; @@ -1685,13 +1675,6 @@ static int emac_dev_stop(struct net_device *ndev) if (priv->phydev) phy_disconnect(priv->phydev); - /* Free IRQ */ - while ((res = platform_get_resource(priv->pdev, IORESOURCE_IRQ, i))) { - for (irq_num = res->start; irq_num <= res->end; irq_num++) - free_irq(irq_num, priv->ndev); - i++; - } - if (netif_msg_drv(priv)) dev_notice(emac_dev, "DaVinci EMAC: %s stopped\n", ndev->name); @@ -1771,29 +1754,22 @@ static const struct net_device_ops emac_netdev_ops = { #endif }; -#ifdef CONFIG_OF -static struct emac_platform_data - *davinci_emac_of_get_pdata(struct platform_device *pdev, - struct emac_priv *priv) +static struct emac_platform_data * +davinci_emac_of_get_pdata(struct platform_device *pdev, struct emac_priv *priv) { struct device_node *np; struct emac_platform_data *pdata = NULL; const u8 *mac_addr; - u32 data; - int ret; - pdata = pdev->dev.platform_data; - if (!pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - goto nodata; - } + if (!IS_ENABLED(CONFIG_OF) || !pdev->dev.of_node) + return pdev->dev.platform_data; + + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return NULL; np = pdev->dev.of_node; - if (!np) - goto nodata; - else - pdata->version = EMAC_VERSION_2; + pdata->version = EMAC_VERSION_2; if (!is_valid_ether_addr(pdata->mac_addr)) { mac_addr = of_get_mac_address(np); @@ -1801,47 +1777,31 @@ static struct emac_platform_data memcpy(pdata->mac_addr, mac_addr, ETH_ALEN); } - ret = of_property_read_u32(np, "ti,davinci-ctrl-reg-offset", &data); - if (!ret) - pdata->ctrl_reg_offset = data; + of_property_read_u32(np, "ti,davinci-ctrl-reg-offset", + &pdata->ctrl_reg_offset); - ret = of_property_read_u32(np, "ti,davinci-ctrl-mod-reg-offset", - &data); - if (!ret) - pdata->ctrl_mod_reg_offset = data; + of_property_read_u32(np, "ti,davinci-ctrl-mod-reg-offset", + &pdata->ctrl_mod_reg_offset); - ret = of_property_read_u32(np, "ti,davinci-ctrl-ram-offset", &data); - if (!ret) - pdata->ctrl_ram_offset = data; + of_property_read_u32(np, "ti,davinci-ctrl-ram-offset", + &pdata->ctrl_ram_offset); - ret = of_property_read_u32(np, "ti,davinci-ctrl-ram-size", &data); - if (!ret) - pdata->ctrl_ram_size = data; + of_property_read_u32(np, "ti,davinci-ctrl-ram-size", + &pdata->ctrl_ram_size); - ret = of_property_read_u32(np, "ti,davinci-rmii-en", &data); - if (!ret) - pdata->rmii_en = data; + of_property_read_u8(np, "ti,davinci-rmii-en", &pdata->rmii_en); - ret = of_property_read_u32(np, "ti,davinci-no-bd-ram", &data); - if (!ret) - pdata->no_bd_ram = data; + pdata->no_bd_ram = of_property_read_bool(np, "ti,davinci-no-bd-ram"); priv->phy_node = of_parse_phandle(np, "phy-handle", 0); if (!priv->phy_node) pdata->phy_id = ""; pdev->dev.platform_data = pdata; -nodata: + return pdata; } -#else -static struct emac_platform_data - *davinci_emac_of_get_pdata(struct platform_device *pdev, - struct emac_priv *priv) -{ - return pdev->dev.platform_data; -} -#endif + /** * davinci_emac_probe - EMAC device probe * @pdev: The DaVinci EMAC device that we are removing @@ -1856,7 +1816,7 @@ static int davinci_emac_probe(struct platform_device *pdev) struct resource *res; struct net_device *ndev; struct emac_priv *priv; - unsigned long size, hw_ram_addr; + unsigned long hw_ram_addr; struct emac_platform_data *pdata; struct device *emac_dev; struct cpdma_params dma_params; @@ -1907,25 +1867,11 @@ static int davinci_emac_probe(struct platform_device *pdev) emac_dev = &ndev->dev; /* Get EMAC platform data */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev,"error getting res\n"); - rc = -ENOENT; - goto no_pdata; - } - priv->emac_base_phys = res->start + pdata->ctrl_reg_offset; - size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, - size, ndev->name)) { - dev_err(&pdev->dev, "failed request_mem_region() for regs\n"); - rc = -ENXIO; - goto no_pdata; - } - - priv->remap_addr = devm_ioremap(&pdev->dev, res->start, size); - if (!priv->remap_addr) { + priv->remap_addr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(priv->remap_addr)) { dev_err(&pdev->dev, "unable to map IO\n"); - rc = -ENOMEM; + rc = PTR_ERR(priv->remap_addr); goto no_pdata; } priv->emac_base = priv->remap_addr + pdata->ctrl_reg_offset; @@ -2076,11 +2022,13 @@ static const struct dev_pm_ops davinci_emac_pm_ops = { .resume = davinci_emac_resume, }; +#if IS_ENABLED(CONFIG_OF) static const struct of_device_id davinci_emac_of_match[] = { {.compatible = "ti,davinci-dm6467-emac", }, {}, }; MODULE_DEVICE_TABLE(of, davinci_emac_of_match); +#endif /* davinci_emac_driver: EMAC platform driver structure */ static struct platform_driver davinci_emac_driver = { diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c index c47f0dbcebb..dac6f5832d1 100644 --- a/drivers/net/ethernet/ti/davinci_mdio.c +++ b/drivers/net/ethernet/ti/davinci_mdio.c @@ -291,6 +291,7 @@ static int davinci_mdio_write(struct mii_bus *bus, int phy_id, return 0; } +#if IS_ENABLED(CONFIG_OF) static int davinci_mdio_probe_dt(struct mdio_platform_data *data, struct platform_device *pdev) { @@ -308,7 +309,7 @@ static int davinci_mdio_probe_dt(struct mdio_platform_data *data, return 0; } - +#endif static int davinci_mdio_probe(struct platform_device *pdev) { @@ -477,11 +478,13 @@ static const struct dev_pm_ops davinci_mdio_pm_ops = { .resume_early = davinci_mdio_resume, }; +#if IS_ENABLED(CONFIG_OF) static const struct of_device_id davinci_mdio_of_mtable[] = { { .compatible = "ti,davinci_mdio", }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, davinci_mdio_of_mtable); +#endif static struct platform_driver davinci_mdio_driver = { .driver = { diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d811b06e2cc..18373b6ae37 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -638,6 +638,14 @@ static int macvlan_ethtool_get_settings(struct net_device *dev, return __ethtool_get_settings(vlan->lowerdev, cmd); } +static netdev_features_t macvlan_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct macvlan_dev *vlan = netdev_priv(dev); + + return features & (vlan->set_features | ~MACVLAN_FEATURES); +} + static const struct ethtool_ops macvlan_ethtool_ops = { .get_link = ethtool_op_get_link, .get_settings = macvlan_ethtool_get_settings, @@ -651,6 +659,7 @@ static const struct net_device_ops macvlan_netdev_ops = { .ndo_stop = macvlan_stop, .ndo_start_xmit = macvlan_start_xmit, .ndo_change_mtu = macvlan_change_mtu, + .ndo_fix_features = macvlan_fix_features, .ndo_change_rx_flags = macvlan_change_rx_flags, .ndo_set_mac_address = macvlan_set_mac_address, .ndo_set_rx_mode = macvlan_set_mac_lists, @@ -791,6 +800,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, vlan->port = port; vlan->receive = receive; vlan->forward = forward; + vlan->set_features = MACVLAN_FEATURES; vlan->mode = MACVLAN_MODE_VEPA; if (data && data[IFLA_MACVLAN_MODE]) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 5a76f20776a..5bfaecdd235 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -65,11 +65,14 @@ static struct cdev macvtap_cdev; static const struct proto_ops macvtap_socket_ops; +#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ + NETIF_F_TSO6 | NETIF_F_UFO) +#define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) /* * RCU usage: * The macvtap_queue and the macvlan_dev are loosely coupled, the * pointers from one to the other can only be read while rcu_read_lock - * or macvtap_lock is held. + * or rtnl is held. * * Both the file and the macvlan_dev hold a reference on the macvtap_queue * through sock_hold(&q->sk). When the macvlan_dev goes away first, @@ -81,7 +84,6 @@ static const struct proto_ops macvtap_socket_ops; * file or the dev. The data structure is freed through __sk_free * when both our references and any pending SKBs are gone. */ -static DEFINE_SPINLOCK(macvtap_lock); static int macvtap_enable_queue(struct net_device *dev, struct file *file, struct macvtap_queue *q) @@ -89,7 +91,7 @@ static int macvtap_enable_queue(struct net_device *dev, struct file *file, struct macvlan_dev *vlan = netdev_priv(dev); int err = -EINVAL; - spin_lock(&macvtap_lock); + ASSERT_RTNL(); if (q->enabled) goto out; @@ -101,7 +103,6 @@ static int macvtap_enable_queue(struct net_device *dev, struct file *file, vlan->numvtaps++; out: - spin_unlock(&macvtap_lock); return err; } @@ -111,7 +112,7 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file, struct macvlan_dev *vlan = netdev_priv(dev); int err = -EBUSY; - spin_lock(&macvtap_lock); + rtnl_lock(); if (vlan->numqueues == MAX_MACVTAP_QUEUES) goto out; @@ -130,26 +131,25 @@ static int macvtap_set_queue(struct net_device *dev, struct file *file, vlan->numqueues++; out: - spin_unlock(&macvtap_lock); + rtnl_unlock(); return err; } -static int __macvtap_disable_queue(struct macvtap_queue *q) +static int macvtap_disable_queue(struct macvtap_queue *q) { struct macvlan_dev *vlan; struct macvtap_queue *nq; - vlan = rcu_dereference_protected(q->vlan, - lockdep_is_held(&macvtap_lock)); - + ASSERT_RTNL(); if (!q->enabled) return -EINVAL; + vlan = rtnl_dereference(q->vlan); + if (vlan) { int index = q->queue_index; BUG_ON(index >= vlan->numvtaps); - nq = rcu_dereference_protected(vlan->taps[vlan->numvtaps - 1], - lockdep_is_held(&macvtap_lock)); + nq = rtnl_dereference(vlan->taps[vlan->numvtaps - 1]); nq->queue_index = index; rcu_assign_pointer(vlan->taps[index], nq); @@ -162,17 +162,6 @@ static int __macvtap_disable_queue(struct macvtap_queue *q) return 0; } -static int macvtap_disable_queue(struct macvtap_queue *q) -{ - int err; - - spin_lock(&macvtap_lock); - err = __macvtap_disable_queue(q); - spin_unlock(&macvtap_lock); - - return err; -} - /* * The file owning the queue got closed, give up both * the reference that the files holds as well as the @@ -185,12 +174,12 @@ static void macvtap_put_queue(struct macvtap_queue *q) { struct macvlan_dev *vlan; - spin_lock(&macvtap_lock); - vlan = rcu_dereference_protected(q->vlan, - lockdep_is_held(&macvtap_lock)); + rtnl_lock(); + vlan = rtnl_dereference(q->vlan); + if (vlan) { if (q->enabled) - BUG_ON(__macvtap_disable_queue(q)); + BUG_ON(macvtap_disable_queue(q)); vlan->numqueues--; RCU_INIT_POINTER(q->vlan, NULL); @@ -198,7 +187,7 @@ static void macvtap_put_queue(struct macvtap_queue *q) list_del_init(&q->next); } - spin_unlock(&macvtap_lock); + rtnl_unlock(); synchronize_rcu(); sock_put(&q->sk); @@ -260,7 +249,7 @@ static void macvtap_del_queues(struct net_device *dev) struct macvtap_queue *q, *tmp, *qlist[MAX_MACVTAP_QUEUES]; int i, j = 0; - spin_lock(&macvtap_lock); + ASSERT_RTNL(); list_for_each_entry_safe(q, tmp, &vlan->queue_list, next) { list_del_init(&q->next); qlist[j++] = q; @@ -275,9 +264,6 @@ static void macvtap_del_queues(struct net_device *dev) BUG_ON(vlan->numqueues); /* guarantee that any future macvtap_set_queue will fail */ vlan->numvtaps = MAX_MACVTAP_QUEUES; - spin_unlock(&macvtap_lock); - - synchronize_rcu(); for (--j; j >= 0; j--) sock_put(&qlist[j]->sk); @@ -290,14 +276,44 @@ static void macvtap_del_queues(struct net_device *dev) */ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) { + struct macvlan_dev *vlan = netdev_priv(dev); struct macvtap_queue *q = macvtap_get_queue(dev, skb); + netdev_features_t features; if (!q) goto drop; if (skb_queue_len(&q->sk.sk_receive_queue) >= dev->tx_queue_len) goto drop; - skb_queue_tail(&q->sk.sk_receive_queue, skb); + skb->dev = dev; + /* Apply the forward feature mask so that we perform segmentation + * according to users wishes. + */ + features = netif_skb_features(skb) & vlan->tap_features; + if (netif_needs_gso(skb, features)) { + struct sk_buff *segs = __skb_gso_segment(skb, features, false); + + if (IS_ERR(segs)) + goto drop; + + if (!segs) { + skb_queue_tail(&q->sk.sk_receive_queue, skb); + goto wake_up; + } + + kfree_skb(skb); + while (segs) { + struct sk_buff *nskb = segs->next; + + segs->next = NULL; + skb_queue_tail(&q->sk.sk_receive_queue, segs); + segs = nskb; + } + } else { + skb_queue_tail(&q->sk.sk_receive_queue, skb); + } + +wake_up: wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); return NET_RX_SUCCESS; @@ -366,6 +382,11 @@ static int macvtap_newlink(struct net *src_net, struct macvlan_dev *vlan = netdev_priv(dev); INIT_LIST_HEAD(&vlan->queue_list); + /* Since macvlan supports all offloads by default, make + * tap support all offloads also. + */ + vlan->tap_features = TUN_OFFLOADS; + /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ @@ -771,8 +792,8 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, skb_probe_transport_header(skb, ETH_HLEN); - rcu_read_lock_bh(); - vlan = rcu_dereference_bh(q->vlan); + rcu_read_lock(); + vlan = rcu_dereference(q->vlan); /* copy skb_ubuf_info for callback when skb has no error */ if (zerocopy) { skb_shinfo(skb)->destructor_arg = m->msg_control; @@ -783,7 +804,7 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, struct msghdr *m, macvlan_start_xmit(skb, vlan->dev); else kfree_skb(skb); - rcu_read_unlock_bh(); + rcu_read_unlock(); return total_len; @@ -791,11 +812,11 @@ err_kfree: kfree_skb(skb); err: - rcu_read_lock_bh(); - vlan = rcu_dereference_bh(q->vlan); + rcu_read_lock(); + vlan = rcu_dereference(q->vlan); if (vlan) vlan->dev->stats.tx_dropped++; - rcu_read_unlock_bh(); + rcu_read_unlock(); return err; } @@ -871,11 +892,11 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, copied += len; done: - rcu_read_lock_bh(); - vlan = rcu_dereference_bh(q->vlan); + rcu_read_lock(); + vlan = rcu_dereference(q->vlan); if (vlan) macvlan_count_rx(vlan, copied - vnet_hdr_len, ret == 0, 0); - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret ? ret : copied; } @@ -941,11 +962,10 @@ static struct macvlan_dev *macvtap_get_vlan(struct macvtap_queue *q) { struct macvlan_dev *vlan; - rcu_read_lock_bh(); - vlan = rcu_dereference_bh(q->vlan); + ASSERT_RTNL(); + vlan = rtnl_dereference(q->vlan); if (vlan) dev_hold(vlan->dev); - rcu_read_unlock_bh(); return vlan; } @@ -976,6 +996,58 @@ static int macvtap_ioctl_set_queue(struct file *file, unsigned int flags) return ret; } +static int set_offload(struct macvtap_queue *q, unsigned long arg) +{ + struct macvlan_dev *vlan; + netdev_features_t features; + netdev_features_t feature_mask = 0; + + vlan = rtnl_dereference(q->vlan); + if (!vlan) + return -ENOLINK; + + features = vlan->dev->features; + + if (arg & TUN_F_CSUM) { + feature_mask = NETIF_F_HW_CSUM; + + if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) { + if (arg & TUN_F_TSO_ECN) + feature_mask |= NETIF_F_TSO_ECN; + if (arg & TUN_F_TSO4) + feature_mask |= NETIF_F_TSO; + if (arg & TUN_F_TSO6) + feature_mask |= NETIF_F_TSO6; + } + + if (arg & TUN_F_UFO) + feature_mask |= NETIF_F_UFO; + } + + /* tun/tap driver inverts the usage for TSO offloads, where + * setting the TSO bit means that the userspace wants to + * accept TSO frames and turning it off means that user space + * does not support TSO. + * For macvtap, we have to invert it to mean the same thing. + * When user space turns off TSO, we turn off GSO/LRO so that + * user-space will not receive TSO frames. + */ + if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO)) + features |= RX_OFFLOADS; + else + features &= ~RX_OFFLOADS; + + /* tap_features are the same as features on tun/tap and + * reflect user expectations. + */ + vlan->tap_features = vlan->dev->features & + (feature_mask | ~TUN_OFFLOADS); + vlan->set_features = features; + netdev_update_features(vlan->dev); + + return 0; +} + /* * provide compatibility with generic tun/tap interface */ @@ -1008,21 +1080,27 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, return ret; case TUNGETIFF: + rtnl_lock(); vlan = macvtap_get_vlan(q); - if (!vlan) + if (!vlan) { + rtnl_unlock(); return -ENOLINK; + } ret = 0; if (copy_to_user(&ifr->ifr_name, vlan->dev->name, IFNAMSIZ) || put_user(q->flags, &ifr->ifr_flags)) ret = -EFAULT; macvtap_put_vlan(vlan); + rtnl_unlock(); return ret; case TUNSETQUEUE: if (get_user(u, &ifr->ifr_flags)) return -EFAULT; - return macvtap_ioctl_set_queue(file, u); + rtnl_lock(); + ret = macvtap_ioctl_set_queue(file, u); + rtnl_unlock(); case TUNGETFEATURES: if (put_user(IFF_TAP | IFF_NO_PI | IFF_VNET_HDR | @@ -1062,7 +1140,10 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, got enabled for forwarded frames */ if (!(q->flags & IFF_VNET_HDR)) return -EINVAL; - return 0; + rtnl_lock(); + ret = set_offload(q, arg); + rtnl_unlock(); + return ret; default: return -EINVAL; diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c new file mode 100644 index 00000000000..dc364be6e61 --- /dev/null +++ b/drivers/net/nlmon.c @@ -0,0 +1,170 @@ +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <net/net_namespace.h> +#include <linux/if_arp.h> + +struct pcpu_lstats { + u64 packets; + u64 bytes; + struct u64_stats_sync syncp; +}; + +static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int len = skb->len; + struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats); + + u64_stats_update_begin(&stats->syncp); + stats->bytes += len; + stats->packets++; + u64_stats_update_end(&stats->syncp); + + dev_kfree_skb(skb); + + return NETDEV_TX_OK; +} + +static int nlmon_is_valid_mtu(int new_mtu) +{ + return new_mtu >= sizeof(struct nlmsghdr) && new_mtu <= INT_MAX; +} + +static int nlmon_change_mtu(struct net_device *dev, int new_mtu) +{ + if (!nlmon_is_valid_mtu(new_mtu)) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static int nlmon_dev_init(struct net_device *dev) +{ + dev->lstats = alloc_percpu(struct pcpu_lstats); + + return dev->lstats == NULL ? -ENOMEM : 0; +} + +static void nlmon_dev_uninit(struct net_device *dev) +{ + free_percpu(dev->lstats); +} + +static struct netlink_tap nlmon_tap; + +static int nlmon_open(struct net_device *dev) +{ + return netlink_add_tap(&nlmon_tap); +} + +static int nlmon_close(struct net_device *dev) +{ + return netlink_remove_tap(&nlmon_tap); +} + +static struct rtnl_link_stats64 * +nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + u64 bytes = 0, packets = 0; + + for_each_possible_cpu(i) { + const struct pcpu_lstats *nl_stats; + u64 tbytes, tpackets; + unsigned int start; + + nl_stats = per_cpu_ptr(dev->lstats, i); + + do { + start = u64_stats_fetch_begin_bh(&nl_stats->syncp); + tbytes = nl_stats->bytes; + tpackets = nl_stats->packets; + } while (u64_stats_fetch_retry_bh(&nl_stats->syncp, start)); + + packets += tpackets; + bytes += tbytes; + } + + stats->rx_packets = packets; + stats->tx_packets = 0; + + stats->rx_bytes = bytes; + stats->tx_bytes = 0; + + return stats; +} + +static u32 always_on(struct net_device *dev) +{ + return 1; +} + +static const struct ethtool_ops nlmon_ethtool_ops = { + .get_link = always_on, +}; + +static const struct net_device_ops nlmon_ops = { + .ndo_init = nlmon_dev_init, + .ndo_uninit = nlmon_dev_uninit, + .ndo_open = nlmon_open, + .ndo_stop = nlmon_close, + .ndo_start_xmit = nlmon_xmit, + .ndo_get_stats64 = nlmon_get_stats64, + .ndo_change_mtu = nlmon_change_mtu, +}; + +static struct netlink_tap nlmon_tap __read_mostly = { + .module = THIS_MODULE, +}; + +static void nlmon_setup(struct net_device *dev) +{ + dev->type = ARPHRD_NETLINK; + dev->tx_queue_len = 0; + + dev->netdev_ops = &nlmon_ops; + dev->ethtool_ops = &nlmon_ethtool_ops; + dev->destructor = free_netdev; + + dev->features = NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; + dev->flags = IFF_NOARP; + + /* That's rather a softlimit here, which, of course, + * can be altered. Not a real MTU, but what is to be + * expected in most cases. + */ + dev->mtu = NLMSG_GOODSIZE; +} + +static __init int nlmon_register(void) +{ + int err; + struct net_device *nldev; + + nldev = nlmon_tap.dev = alloc_netdev(0, "netlink", nlmon_setup); + if (unlikely(nldev == NULL)) + return -ENOMEM; + + err = register_netdev(nldev); + if (unlikely(err)) + free_netdev(nldev); + + return err; +} + +static __exit void nlmon_unregister(void) +{ + struct net_device *nldev = nlmon_tap.dev; + + unregister_netdev(nldev); +} + +module_init(nlmon_register); +module_exit(nlmon_unregister); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); +MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); +MODULE_DESCRIPTION("Netlink monitoring device"); diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c index 9ca3996f65b..279ad504ec3 100644 --- a/drivers/s390/net/netiucv.c +++ b/drivers/s390/net/netiucv.c @@ -130,26 +130,6 @@ static inline int iucv_dbf_passes(debug_info_t *dbf_grp, int level) /** * some more debug stuff */ -#define IUCV_HEXDUMP16(importance,header,ptr) \ -PRINT_##importance(header "%02x %02x %02x %02x %02x %02x %02x %02x " \ - "%02x %02x %02x %02x %02x %02x %02x %02x\n", \ - *(((char*)ptr)),*(((char*)ptr)+1),*(((char*)ptr)+2), \ - *(((char*)ptr)+3),*(((char*)ptr)+4),*(((char*)ptr)+5), \ - *(((char*)ptr)+6),*(((char*)ptr)+7),*(((char*)ptr)+8), \ - *(((char*)ptr)+9),*(((char*)ptr)+10),*(((char*)ptr)+11), \ - *(((char*)ptr)+12),*(((char*)ptr)+13), \ - *(((char*)ptr)+14),*(((char*)ptr)+15)); \ -PRINT_##importance(header "%02x %02x %02x %02x %02x %02x %02x %02x " \ - "%02x %02x %02x %02x %02x %02x %02x %02x\n", \ - *(((char*)ptr)+16),*(((char*)ptr)+17), \ - *(((char*)ptr)+18),*(((char*)ptr)+19), \ - *(((char*)ptr)+20),*(((char*)ptr)+21), \ - *(((char*)ptr)+22),*(((char*)ptr)+23), \ - *(((char*)ptr)+24),*(((char*)ptr)+25), \ - *(((char*)ptr)+26),*(((char*)ptr)+27), \ - *(((char*)ptr)+28),*(((char*)ptr)+29), \ - *(((char*)ptr)+30),*(((char*)ptr)+31)); - #define PRINTK_HEADER " iucv: " /* for debugging */ /* dummy device to make sure netiucv_pm functions are called */ diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h index c4f392d5db4..41ef94320ee 100644 --- a/drivers/s390/net/qeth_core.h +++ b/drivers/s390/net/qeth_core.h @@ -738,7 +738,7 @@ struct qeth_rx { int qdio_err; }; -#define QETH_NAPI_WEIGHT 128 +#define QETH_NAPI_WEIGHT NAPI_POLL_WEIGHT struct qeth_card { struct list_head list; diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 6cd0fc1b203..e4ca7047519 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1282,8 +1282,10 @@ static void qeth_free_qdio_buffers(struct qeth_card *card) qeth_free_cq(card); cancel_delayed_work_sync(&card->buffer_reclaim_work); - for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j) - dev_kfree_skb_any(card->qdio.in_q->bufs[j].rx_skb); + for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j) { + if (card->qdio.in_q->bufs[j].rx_skb) + dev_kfree_skb_any(card->qdio.in_q->bufs[j].rx_skb); + } kfree(card->qdio.in_q); card->qdio.in_q = NULL; /* inbound buffer pool */ @@ -1729,14 +1731,14 @@ static void qeth_configure_blkt_default(struct qeth_card *card, char *prcd) QETH_DBF_TEXT(SETUP, 2, "cfgblkt"); if (prcd[74] == 0xF0 && prcd[75] == 0xF0 && - (prcd[76] == 0xF5 || prcd[76] == 0xF6)) { - card->info.blkt.time_total = 250; - card->info.blkt.inter_packet = 5; - card->info.blkt.inter_packet_jumbo = 15; - } else { + prcd[76] >= 0xF1 && prcd[76] <= 0xF4) { card->info.blkt.time_total = 0; card->info.blkt.inter_packet = 0; card->info.blkt.inter_packet_jumbo = 0; + } else { + card->info.blkt.time_total = 250; + card->info.blkt.inter_packet = 5; + card->info.blkt.inter_packet_jumbo = 15; } } @@ -2198,11 +2200,11 @@ static inline int qeth_get_initial_mtu_for_card(struct qeth_card *card) case QETH_LINK_TYPE_LANE_TR: return 2000; default: - return 1492; + return card->options.layer2 ? 1500 : 1492; } case QETH_CARD_TYPE_OSM: case QETH_CARD_TYPE_OSX: - return 1492; + return card->options.layer2 ? 1500 : 1492; default: return 1500; } @@ -2275,9 +2277,10 @@ static int qeth_ulp_enable_cb(struct qeth_card *card, struct qeth_reply *reply, card->info.max_mtu = mtu; card->qdio.in_buf_size = mtu + 2 * PAGE_SIZE; } else { - card->info.initial_mtu = qeth_get_initial_mtu_for_card(card); card->info.max_mtu = *(__u16 *)QETH_ULP_ENABLE_RESP_MAX_MTU( iob->data); + card->info.initial_mtu = min(card->info.max_mtu, + qeth_get_initial_mtu_for_card(card)); card->qdio.in_buf_size = QETH_IN_BUF_SIZE_DEFAULT; } diff --git a/fs/select.c b/fs/select.c index 8c1c96c2706..79b876eb91d 100644 --- a/fs/select.c +++ b/fs/select.c @@ -27,6 +27,7 @@ #include <linux/rcupdate.h> #include <linux/hrtimer.h> #include <linux/sched/rt.h> +#include <net/ll_poll.h> #include <asm/uaccess.h> @@ -384,9 +385,10 @@ get_max: #define POLLEX_SET (POLLPRI) static inline void wait_key_set(poll_table *wait, unsigned long in, - unsigned long out, unsigned long bit) + unsigned long out, unsigned long bit, + unsigned int ll_flag) { - wait->_key = POLLEX_SET; + wait->_key = POLLEX_SET | ll_flag; if (in & bit) wait->_key |= POLLIN_SET; if (out & bit) @@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) poll_table *wait; int retval, i, timed_out = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); rcu_read_lock(); retval = max_select_fd(n, fds); @@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; + bool can_ll = false; inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) f_op = f.file->f_op; mask = DEFAULT_POLLMASK; if (f_op && f_op->poll) { - wait_key_set(wait, in, out, bit); + wait_key_set(wait, in, out, + bit, ll_flag); mask = (*f_op->poll)(f.file, wait); } fdput(f); @@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) retval++; wait->_qproc = NULL; } + if (mask & POLL_LL) + can_ll = true; + /* got something, stop busy polling */ + if (retval) + ll_flag = 0; } } if (res_in) @@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) break; } + if (can_ll && can_poll_ll(ll_time)) + continue; + /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to @@ -717,7 +731,8 @@ struct poll_list { * pwait poll_table will be used by the fd-provided poll handler for waiting, * if pwait->_qproc is non-NULL. */ -static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) +static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, + bool *can_ll, unsigned int ll_flag) { unsigned int mask; int fd; @@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = DEFAULT_POLLMASK; if (f.file->f_op && f.file->f_op->poll) { pwait->_key = pollfd->events|POLLERR|POLLHUP; + pwait->_key |= ll_flag; mask = f.file->f_op->poll(f.file, pwait); + if (mask & POLL_LL) + *can_ll = true; } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; @@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; + unsigned int ll_flag = POLL_LL; + u64 ll_time = ll_end_time(); /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; + bool can_ll = false; for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list, * this. They'll get immediately deregistered * when we break out and return. */ - if (do_pollfd(pfd, pt)) { + if (do_pollfd(pfd, pt, &can_ll, ll_flag)) { count++; pt->_qproc = NULL; + ll_flag = 0; } } } @@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list, if (count || timed_out) break; + if (can_ll && can_poll_ll(ll_time)) + continue; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index f49a9f66c3d..ddd33fd5904 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -65,6 +65,7 @@ struct macvlan_dev { DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ); + netdev_features_t set_features; enum macvlan_mode mode; u16 flags; int (*receive)(struct sk_buff *skb); @@ -75,6 +76,7 @@ struct macvlan_dev { struct list_head queue_list; int numvtaps; int numqueues; + netdev_features_t tap_features; int minor; }; diff --git a/include/linux/ktime.h b/include/linux/ktime.h index bbca12804d1..b4fa5e4cd15 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -323,6 +323,11 @@ static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) return ktime_add_ns(kt, usec * 1000); } +static inline ktime_t ktime_add_ms(const ktime_t kt, const u64 msec) +{ + return ktime_add_ns(kt, msec * NSEC_PER_MSEC); +} + static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) { return ktime_sub_ns(kt, usec * 1000); @@ -366,7 +371,15 @@ extern void ktime_get_ts(struct timespec *ts); static inline ktime_t ns_to_ktime(u64 ns) { static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); } +static inline ktime_t ms_to_ktime(u64 ms) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + + return ktime_add_ms(ktime_zero, ms); +} + #endif diff --git a/include/linux/netlink.h b/include/linux/netlink.h index f78b430f4af..86fde81ac2e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -145,4 +145,14 @@ static inline int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return __netlink_dump_start(ssk, skb, nlh, control); } +struct netlink_tap { + struct net_device *dev; + struct module *module; + struct list_head list; +}; + +extern int netlink_add_tap(struct netlink_tap *nt); +extern int __netlink_remove_tap(struct netlink_tap *nt); +extern int netlink_remove_tap(struct netlink_tap *nt); + #endif /* __LINUX_NETLINK_H */ diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index e07feb456d1..e4c5a2d2ba3 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -50,7 +50,7 @@ struct inet6_ifaddr { int state; - __u8 probes; + __u8 dad_probes; __u8 flags; __u16 scope; @@ -58,7 +58,7 @@ struct inet6_ifaddr { unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ - struct timer_list timer; + struct timer_list dad_timer; struct inet6_dev *idev; struct rt6_info *rt; @@ -195,6 +195,10 @@ struct inet6_dev { struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; + + struct timer_list rs_timer; + __u8 rs_probes; + unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; }; diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h index fcc7c365cee..5bf2b3a6129 100644 --- a/include/net/ll_poll.h +++ b/include/net/ll_poll.h @@ -30,6 +30,7 @@ #ifdef CONFIG_NET_LL_RX_POLL struct napi_struct; +extern unsigned int sysctl_net_ll_read __read_mostly; extern unsigned int sysctl_net_ll_poll __read_mostly; /* return values from ndo_ll_poll */ @@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly; /* we can use sched_clock() because we don't care much about precision * we only care that the average is bounded + * we don't mind a ~2.5% imprecision so <<10 instead of *1000 + * sk->sk_ll_usec is a u_int so this can't overflow */ -static inline u64 ll_end_time(struct sock *sk) +static inline u64 ll_sk_end_time(struct sock *sk) { - u64 end_time = ACCESS_ONCE(sk->sk_ll_usec); - - /* we don't mind a ~2.5% imprecision - * sk->sk_ll_usec is a u_int so this can't overflow - */ - end_time = (end_time << 10) + sched_clock(); + return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock(); +} - return end_time; +/* in poll/select we use the global sysctl_net_ll_poll value */ +static inline u64 ll_end_time(void) +{ + return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock(); } static inline bool sk_valid_ll(struct sock *sk) @@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time) return !time_after64(sched_clock(), end_time); } +/* when used in sock_poll() nonblock is known at compile time to be true + * so the loop and end_time will be optimized out + */ static inline bool sk_poll_ll(struct sock *sk, int nonblock) { + u64 end_time = nonblock ? 0 : ll_sk_end_time(sk); const struct net_device_ops *ops; - u64 end_time = ll_end_time(sk); struct napi_struct *napi; int rc = false; @@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) goto out; do { - rc = ops->ndo_ll_poll(napi); if (rc == LL_FLUSH_FAILED) @@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock) NET_ADD_STATS_BH(sock_net(sk), LINUX_MIB_LOWLATENCYRXPACKETS, rc); - } while (skb_queue_empty(&sk->sk_receive_queue) - && can_poll_ll(end_time) && !nonblock); + } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && + can_poll_ll(end_time)); rc = !skb_queue_empty(&sk->sk_receive_queue); out: @@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) #else /* CONFIG_NET_LL_RX_POLL */ -static inline u64 ll_end_time(struct sock *sk) +static inline u64 sk_ll_end_time(struct sock *sk) +{ + return 0; +} + +static inline u64 ll_end_time(void) { return 0; } diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 6321c081118..e6b95bc4d8e 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -188,11 +188,6 @@ extern struct kmem_cache *sctp_bucket_cachep __read_mostly; * Section: Macros, externs, and inlines */ - -#ifdef TEST_FRAME -#include <test_frame.h> -#else - /* spin lock wrappers. */ #define sctp_spin_lock_irqsave(lock, flags) spin_lock_irqsave(lock, flags) #define sctp_spin_unlock_irqrestore(lock, flags) \ @@ -218,8 +213,6 @@ extern struct kmem_cache *sctp_bucket_cachep __read_mostly; #define SCTP_INC_STATS_USER(net, field) SNMP_INC_STATS_USER((net)->sctp.sctp_statistics, field) #define SCTP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->sctp.sctp_statistics, field) -#endif /* !TEST_FRAME */ - /* sctp mib definitions */ enum { SCTP_MIB_NUM = 0, @@ -567,24 +560,6 @@ for (pos = chunk->subh.fwdtsn_hdr->skip;\ /* Round an int up to the next multiple of 4. */ #define WORD_ROUND(s) (((s)+3)&~3) -/* Compare two timevals. */ -#define tv_lt(s, t) \ - (s.tv_sec < t.tv_sec || (s.tv_sec == t.tv_sec && s.tv_usec < t.tv_usec)) - -/* Add tv1 to tv2. */ -#define TIMEVAL_ADD(tv1, tv2) \ -({ \ - suseconds_t usecs = (tv2).tv_usec + (tv1).tv_usec; \ - time_t secs = (tv2).tv_sec + (tv1).tv_sec; \ -\ - if (usecs >= 1000000) { \ - usecs -= 1000000; \ - secs++; \ - } \ - (tv2).tv_sec = secs; \ - (tv2).tv_usec = usecs; \ -}) - /* External references. */ extern struct proto sctp_prot; diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 1bd4c4144fe..e745c92a153 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -54,7 +54,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ -#include <linux/time.h> /* We get struct timespec. */ +#include <linux/ktime.h> #include <linux/socket.h> /* linux/in.h needs this!! */ #include <linux/in.h> /* We get struct sockaddr_in. */ #include <linux/in6.h> /* We get struct in6_addr */ @@ -284,7 +284,7 @@ struct sctp_cookie { __u32 peer_ttag; /* When does this cookie expire? */ - struct timeval expiration; + ktime_t expiration; /* Number of inbound/outbound streams which are set * and negotiated during the INIT process. @@ -1537,7 +1537,7 @@ struct sctp_association { sctp_state_t state; /* The cookie life I award for any cookie. */ - struct timeval cookie_life; + ktime_t cookie_life; /* Overall : The overall association error count. * Error Count : [Clear this any time I get something.] diff --git a/include/net/tcp.h b/include/net/tcp.h index 6fa80831dc4..d1980054ec7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1319,9 +1319,9 @@ void tcp_fastopen_cookie_gen(__be32 addr, struct tcp_fastopen_cookie *foc); /* Fastopen key context */ struct tcp_fastopen_context { - struct crypto_cipher __rcu *tfm; - __u8 key[TCP_FASTOPEN_KEY_LENGTH]; - struct rcu_head rcu; + struct crypto_cipher *tfm; + __u8 key[TCP_FASTOPEN_KEY_LENGTH]; + struct rcu_head rcu; }; /* write queue abstraction */ diff --git a/include/uapi/asm-generic/poll.h b/include/uapi/asm-generic/poll.h index 9ce7f44aebd..4aee586979c 100644 --- a/include/uapi/asm-generic/poll.h +++ b/include/uapi/asm-generic/poll.h @@ -30,6 +30,8 @@ #define POLLFREE 0x4000 /* currently only for epoll */ +#define POLL_LL 0x8000 + struct pollfd { int fd; short events; diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index 82c7d1bdade..d7fea3496f3 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -93,6 +93,7 @@ #define ARPHRD_PHONET_PIPE 821 /* PhoNet pipe header */ #define ARPHRD_CAIF 822 /* CAIF media type */ #define ARPHRD_IP6GRE 823 /* GRE over IPv6 */ +#define ARPHRD_NETLINK 824 /* Netlink header */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ diff --git a/net/core/sock.c b/net/core/sock.c index 1e744b12fda..b6c619f4d47 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) #ifdef CONFIG_NET_LL_RX_POLL sk->sk_napi_id = 0; - sk->sk_ll_usec = sysctl_net_ll_poll; + sk->sk_ll_usec = sysctl_net_ll_read; #endif /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 62702c2053d..afc677eadd9 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "low_latency_read", + .data = &sysctl_net_ll_read, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +# #endif #endif /* CONFIG_NET */ { diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 90788a1c6bb..afaf3cdadf5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -253,37 +253,32 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev) return !qdisc_tx_is_noop(dev); } -static void addrconf_del_timer(struct inet6_ifaddr *ifp) +static void addrconf_del_rs_timer(struct inet6_dev *idev) { - if (del_timer(&ifp->timer)) + if (del_timer(&idev->rs_timer)) + __in6_dev_put(idev); +} + +static void addrconf_del_dad_timer(struct inet6_ifaddr *ifp) +{ + if (del_timer(&ifp->dad_timer)) __in6_ifa_put(ifp); } -enum addrconf_timer_t { - AC_NONE, - AC_DAD, - AC_RS, -}; +static void addrconf_mod_rs_timer(struct inet6_dev *idev, + unsigned long when) +{ + if (!timer_pending(&idev->rs_timer)) + in6_dev_hold(idev); + mod_timer(&idev->rs_timer, jiffies + when); +} -static void addrconf_mod_timer(struct inet6_ifaddr *ifp, - enum addrconf_timer_t what, - unsigned long when) +static void addrconf_mod_dad_timer(struct inet6_ifaddr *ifp, + unsigned long when) { - if (!del_timer(&ifp->timer)) + if (!timer_pending(&ifp->dad_timer)) in6_ifa_hold(ifp); - - switch (what) { - case AC_DAD: - ifp->timer.function = addrconf_dad_timer; - break; - case AC_RS: - ifp->timer.function = addrconf_rs_timer; - break; - default: - break; - } - ifp->timer.expires = jiffies + when; - add_timer(&ifp->timer); + mod_timer(&ifp->dad_timer, jiffies + when); } static int snmp6_alloc_dev(struct inet6_dev *idev) @@ -326,6 +321,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) WARN_ON(!list_empty(&idev->addr_list)); WARN_ON(idev->mc_list != NULL); + WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL"); @@ -357,7 +353,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) rwlock_init(&ndev->lock); ndev->dev = dev; INIT_LIST_HEAD(&ndev->addr_list); - + setup_timer(&ndev->rs_timer, addrconf_rs_timer, + (unsigned long)ndev); memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf)); ndev->cnf.mtu6 = dev->mtu; ndev->cnf.sysctl = NULL; @@ -776,7 +773,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) in6_dev_put(ifp->idev); - if (del_timer(&ifp->timer)) + if (del_timer(&ifp->dad_timer)) pr_notice("Timer is still running, when freeing ifa=%p\n", ifp); if (ifp->state != INET6_IFADDR_STATE_DEAD) { @@ -869,9 +866,9 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, spin_lock_init(&ifa->lock); spin_lock_init(&ifa->state_lock); - init_timer(&ifa->timer); + setup_timer(&ifa->dad_timer, addrconf_dad_timer, + (unsigned long)ifa); INIT_HLIST_NODE(&ifa->addr_lst); - ifa->timer.data = (unsigned long) ifa; ifa->scope = scope; ifa->prefix_len = pfxlen; ifa->flags = flags | IFA_F_TENTATIVE; @@ -994,7 +991,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) } write_unlock_bh(&idev->lock); - addrconf_del_timer(ifp); + addrconf_del_dad_timer(ifp); ipv6_ifa_notify(RTM_DELADDR, ifp); @@ -1447,6 +1444,23 @@ try_nextdev: } EXPORT_SYMBOL(ipv6_dev_get_saddr); +static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + unsigned char banned_flags) +{ + struct inet6_ifaddr *ifp; + int err = -EADDRNOTAVAIL; + + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if (ifp->scope == IFA_LINK && + !(ifp->flags & banned_flags)) { + *addr = ifp->addr; + err = 0; + break; + } + } + return err; +} + int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, unsigned char banned_flags) { @@ -1456,17 +1470,8 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - struct inet6_ifaddr *ifp; - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) { - if (ifp->scope == IFA_LINK && - !(ifp->flags & banned_flags)) { - *addr = ifp->addr; - err = 0; - break; - } - } + err = __ipv6_get_lladdr(idev, addr, banned_flags); read_unlock_bh(&idev->lock); } rcu_read_unlock(); @@ -1580,7 +1585,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) { if (ifp->flags&IFA_F_PERMANENT) { spin_lock_bh(&ifp->lock); - addrconf_del_timer(ifp); + addrconf_del_dad_timer(ifp); ifp->flags |= IFA_F_TENTATIVE; if (dad_failed) ifp->flags |= IFA_F_DADFAILED; @@ -2502,12 +2507,6 @@ static int inet6_addr_del(struct net *net, int ifindex, const struct in6_addr *p read_unlock_bh(&idev->lock); ipv6_del_addr(ifp); - - /* If the last address is deleted administratively, - disable IPv6 on this interface. - */ - if (list_empty(&idev->addr_list)) - addrconf_ifdown(idev->dev, 1); return 0; } } @@ -3036,7 +3035,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { hlist_del_init_rcu(&ifa->addr_lst); - addrconf_del_timer(ifa); + addrconf_del_dad_timer(ifa); goto restart; } } @@ -3045,6 +3044,8 @@ static int addrconf_ifdown(struct net_device *dev, int how) write_lock_bh(&idev->lock); + addrconf_del_rs_timer(idev); + /* Step 2: clear flags for stateless addrconf */ if (!how) idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY); @@ -3074,7 +3075,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) while (!list_empty(&idev->addr_list)) { ifa = list_first_entry(&idev->addr_list, struct inet6_ifaddr, if_list); - addrconf_del_timer(ifa); + addrconf_del_dad_timer(ifa); list_del(&ifa->if_list); @@ -3116,10 +3117,10 @@ static int addrconf_ifdown(struct net_device *dev, int how) static void addrconf_rs_timer(unsigned long data) { - struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data; - struct inet6_dev *idev = ifp->idev; + struct inet6_dev *idev = (struct inet6_dev *)data; + struct in6_addr lladdr; - read_lock(&idev->lock); + write_lock(&idev->lock); if (idev->dead || !(idev->if_flags & IF_READY)) goto out; @@ -3130,18 +3131,19 @@ static void addrconf_rs_timer(unsigned long data) if (idev->if_flags & IF_RA_RCVD) goto out; - spin_lock(&ifp->lock); - if (ifp->probes++ < idev->cnf.rtr_solicits) { - /* The wait after the last probe can be shorter */ - addrconf_mod_timer(ifp, AC_RS, - (ifp->probes == idev->cnf.rtr_solicits) ? - idev->cnf.rtr_solicit_delay : - idev->cnf.rtr_solicit_interval); - spin_unlock(&ifp->lock); + if (idev->rs_probes++ < idev->cnf.rtr_solicits) { + if (!__ipv6_get_lladdr(idev, &lladdr, IFA_F_TENTATIVE)) + ndisc_send_rs(idev->dev, &lladdr, + &in6addr_linklocal_allrouters); + else + goto out; - ndisc_send_rs(idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + /* The wait after the last probe can be shorter */ + addrconf_mod_rs_timer(idev, (idev->rs_probes == + idev->cnf.rtr_solicits) ? + idev->cnf.rtr_solicit_delay : + idev->cnf.rtr_solicit_interval); } else { - spin_unlock(&ifp->lock); /* * Note: we do not support deprecated "all on-link" * assumption any longer. @@ -3150,8 +3152,8 @@ static void addrconf_rs_timer(unsigned long data) } out: - read_unlock(&idev->lock); - in6_ifa_put(ifp); + write_unlock(&idev->lock); + in6_dev_put(idev); } /* @@ -3167,8 +3169,8 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) else rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1); - ifp->probes = idev->cnf.dad_transmits; - addrconf_mod_timer(ifp, AC_DAD, rand_num); + ifp->dad_probes = idev->cnf.dad_transmits; + addrconf_mod_dad_timer(ifp, rand_num); } static void addrconf_dad_start(struct inet6_ifaddr *ifp) @@ -3229,40 +3231,40 @@ static void addrconf_dad_timer(unsigned long data) struct inet6_dev *idev = ifp->idev; struct in6_addr mcaddr; - if (!ifp->probes && addrconf_dad_end(ifp)) + if (!ifp->dad_probes && addrconf_dad_end(ifp)) goto out; - read_lock(&idev->lock); + write_lock(&idev->lock); if (idev->dead || !(idev->if_flags & IF_READY)) { - read_unlock(&idev->lock); + write_unlock(&idev->lock); goto out; } spin_lock(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_DEAD) { spin_unlock(&ifp->lock); - read_unlock(&idev->lock); + write_unlock(&idev->lock); goto out; } - if (ifp->probes == 0) { + if (ifp->dad_probes == 0) { /* * DAD was successful */ ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED); spin_unlock(&ifp->lock); - read_unlock(&idev->lock); + write_unlock(&idev->lock); addrconf_dad_completed(ifp); goto out; } - ifp->probes--; - addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time); + ifp->dad_probes--; + addrconf_mod_dad_timer(ifp, ifp->idev->nd_parms->retrans_time); spin_unlock(&ifp->lock); - read_unlock(&idev->lock); + write_unlock(&idev->lock); /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); @@ -3274,6 +3276,9 @@ out: static void addrconf_dad_completed(struct inet6_ifaddr *ifp) { struct net_device *dev = ifp->idev->dev; + struct in6_addr lladdr; + + addrconf_del_dad_timer(ifp); /* * Configure the address for reception. Now it is valid. @@ -3294,13 +3299,20 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) * [...] as part of DAD [...] there is no need * to delay again before sending the first RS */ - ndisc_send_rs(ifp->idev->dev, &ifp->addr, &in6addr_linklocal_allrouters); + if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE)) + ndisc_send_rs(dev, &lladdr, + &in6addr_linklocal_allrouters); + else + return; - spin_lock_bh(&ifp->lock); - ifp->probes = 1; + write_lock_bh(&ifp->idev->lock); + spin_lock(&ifp->lock); + ifp->idev->rs_probes = 1; ifp->idev->if_flags |= IF_RS_SENT; - addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); - spin_unlock_bh(&ifp->lock); + addrconf_mod_rs_timer(ifp->idev, + ifp->idev->cnf.rtr_solicit_interval); + spin_unlock(&ifp->lock); + write_unlock_bh(&ifp->idev->lock); } } @@ -4363,6 +4375,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) } write_unlock_bh(&idev->lock); + addrconf_verify(0); return 0; } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 72104562c86..d2f87427244 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -5,6 +5,7 @@ #include <linux/export.h> #include <net/ipv6.h> +#include <net/addrconf.h> #define IPV6_ADDR_SCOPE_TYPE(scope) ((scope) << 16) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 275d901d7e4..6967fbcca6c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -57,6 +57,7 @@ #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> +#include <linux/if_arp.h> #include <asm/cacheflush.h> #include <net/net_namespace.h> @@ -101,6 +102,9 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); static ATOMIC_NOTIFIER_HEAD(netlink_chain); +static DEFINE_SPINLOCK(netlink_tap_lock); +static struct list_head netlink_tap_all __read_mostly; + static inline u32 netlink_group_mask(u32 group) { return group ? 1 << (group - 1) : 0; @@ -111,6 +115,100 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } +int netlink_add_tap(struct netlink_tap *nt) +{ + if (unlikely(nt->dev->type != ARPHRD_NETLINK)) + return -EINVAL; + + spin_lock(&netlink_tap_lock); + list_add_rcu(&nt->list, &netlink_tap_all); + spin_unlock(&netlink_tap_lock); + + if (nt->module) + __module_get(nt->module); + + return 0; +} +EXPORT_SYMBOL_GPL(netlink_add_tap); + +int __netlink_remove_tap(struct netlink_tap *nt) +{ + bool found = false; + struct netlink_tap *tmp; + + spin_lock(&netlink_tap_lock); + + list_for_each_entry(tmp, &netlink_tap_all, list) { + if (nt == tmp) { + list_del_rcu(&nt->list); + found = true; + goto out; + } + } + + pr_warn("__netlink_remove_tap: %p not found\n", nt); +out: + spin_unlock(&netlink_tap_lock); + + if (found && nt->module) + module_put(nt->module); + + return found ? 0 : -ENODEV; +} +EXPORT_SYMBOL_GPL(__netlink_remove_tap); + +int netlink_remove_tap(struct netlink_tap *nt) +{ + int ret; + + ret = __netlink_remove_tap(nt); + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL_GPL(netlink_remove_tap); + +static int __netlink_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct sk_buff *nskb; + int ret = -ENOMEM; + + dev_hold(dev); + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + nskb->dev = dev; + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + } + + dev_put(dev); + return ret; +} + +static void __netlink_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct netlink_tap *tmp; + + list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { + ret = __netlink_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +static void netlink_deliver_tap(struct sk_buff *skb) +{ + rcu_read_lock(); + + if (unlikely(!list_empty(&netlink_tap_all))) + __netlink_deliver_tap(skb); + + rcu_read_unlock(); +} + static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -1518,6 +1616,8 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; + netlink_deliver_tap(skb); + #ifdef CONFIG_NETLINK_MMAP if (netlink_skb_is_mmaped(skb)) netlink_queue_mmaped_skb(sk, skb); @@ -1578,6 +1678,11 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { + /* We could do a netlink_deliver_tap(skb) here as well + * but since this is intended for the kernel only, we + * should rather let it stay under the hood. + */ + ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; @@ -2975,6 +3080,8 @@ static int __init netlink_proto_init(void) nl_table[i].compare = netlink_compare; } + INIT_LIST_HEAD(&netlink_tap_all); + netlink_add_usersock_entry(); sock_register(&netlink_family_ops); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index bf6e6bd553c..9a383a8774e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -102,13 +102,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port); asoc->state = SCTP_STATE_CLOSED; - - /* Set these values from the socket values, a conversion between - * millsecons to seconds/microseconds must also be done. - */ - asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; - asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) - * 1000; + asoc->cookie_life = ms_to_ktime(sp->assocparams.sasoc_cookie_life); asoc->frag_point = 0; asoc->user_frag = sp->user_frag; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index a8b26741c0a..b26999d508b 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -247,10 +247,9 @@ void sctp_endpoint_free(struct sctp_endpoint *ep) /* Final destructor for endpoint. */ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) { - SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); + struct sock *sk; - /* Free up the HMAC transform. */ - crypto_free_hash(sctp_sk(ep->base.sk)->hmac); + SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); /* Free the digest buffer */ kfree(ep->digest); @@ -271,13 +270,15 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) memset(ep->secret_key, 0, sizeof(ep->secret_key)); - /* Remove and free the port */ - if (sctp_sk(ep->base.sk)->bind_hash) - sctp_put_port(ep->base.sk); - /* Give up our hold on the sock. */ - if (ep->base.sk) - sock_put(ep->base.sk); + sk = ep->base.sk; + if (sk != NULL) { + /* Remove and free the port */ + if (sctp_sk(sk)->bind_hash) + sctp_put_port(sk); + + sock_put(sk); + } kfree(ep); SCTP_DBG_OBJCNT_DEC(ep); diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 0c83162a6bf..62526c47705 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -138,7 +138,7 @@ static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_commo peer = asoc->peer.primary_path; if (unlikely(peer == NULL)) { - WARN(1, "Association %p with NULL primary path!", asoc); + WARN(1, "Association %p with NULL primary path!\n", asoc); return; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index fc8548743ed..dd71f1f9ba1 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1630,8 +1630,8 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ - do_gettimeofday(&cookie->c.expiration); - TIMEVAL_ADD(asoc->cookie_life, cookie->c.expiration); + cookie->c.expiration = ktime_add(asoc->cookie_life, + ktime_get()); /* Copy the peer's init packet. */ memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, @@ -1680,7 +1680,7 @@ struct sctp_association *sctp_unpack_cookie( unsigned int len; sctp_scope_t scope; struct sk_buff *skb = chunk->skb; - struct timeval tv; + ktime_t kt; struct hash_desc desc; /* Header size is static data prior to the actual cookie, including @@ -1757,11 +1757,11 @@ no_hmac: * down the new association establishment instead of every packet. */ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) - skb_get_timestamp(skb, &tv); + kt = skb_get_ktime(skb); else - do_gettimeofday(&tv); + kt = ktime_get(); - if (!asoc && tv_lt(bear_cookie->expiration, tv)) { + if (!asoc && ktime_compare(bear_cookie->expiration, kt) < 0) { /* * Section 3.3.10.3 Stale Cookie Error (3) * @@ -1773,9 +1773,7 @@ no_hmac: len = ntohs(chunk->chunk_hdr->length); *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { - suseconds_t usecs = (tv.tv_sec - - bear_cookie->expiration.tv_sec) * 1000000L + - tv.tv_usec - bear_cookie->expiration.tv_usec; + suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, @@ -2514,8 +2512,7 @@ do_addr_param: /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ - asoc->cookie_life.tv_sec += stale / 1000; - asoc->cookie_life.tv_usec += (stale % 1000) * 1000; + asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_HOST_NAME_ADDRESS: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 32db19ba4a2..66fcdcfe1b7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -93,6 +93,7 @@ static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); static void sctp_wait_for_close(struct sock *sk, long timeo); +static void sctp_destruct_sock(struct sock *sk); static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, union sctp_addr *addr, int len); static int sctp_bindx_add(struct sock *, struct sockaddr *, int); @@ -2910,13 +2911,8 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, unsig asoc->max_retrans = assocparams.sasoc_asocmaxrxt; } - if (assocparams.sasoc_cookie_life != 0) { - asoc->cookie_life.tv_sec = - assocparams.sasoc_cookie_life / 1000; - asoc->cookie_life.tv_usec = - (assocparams.sasoc_cookie_life % 1000) - * 1000; - } + if (assocparams.sasoc_cookie_life != 0) + asoc->cookie_life = ms_to_ktime(assocparams.sasoc_cookie_life); } else { /* Set the values to the endpoint */ struct sctp_sock *sp = sctp_sk(sk); @@ -3971,6 +3967,8 @@ static int sctp_init_sock(struct sock *sk) sp->hmac = NULL; + sk->sk_destruct = sctp_destruct_sock; + SCTP_DBG_OBJCNT_INC(sock); local_bh_disable(); @@ -4013,6 +4011,17 @@ static void sctp_destroy_sock(struct sock *sk) local_bh_enable(); } +/* Triggered when there are no references on the socket anymore */ +static void sctp_destruct_sock(struct sock *sk) +{ + struct sctp_sock *sp = sctp_sk(sk); + + /* Free up the HMAC transform. */ + crypto_free_hash(sp->hmac); + + inet_sock_destruct(sk); +} + /* API 4.1.7 shutdown() - TCP Style Syntax * int shutdown(int socket, int how); * @@ -5074,10 +5083,7 @@ static int sctp_getsockopt_associnfo(struct sock *sk, int len, assocparams.sasoc_asocmaxrxt = asoc->max_retrans; assocparams.sasoc_peer_rwnd = asoc->peer.rwnd; assocparams.sasoc_local_rwnd = asoc->a_rwnd; - assocparams.sasoc_cookie_life = (asoc->cookie_life.tv_sec - * 1000) + - (asoc->cookie_life.tv_usec - / 1000); + assocparams.sasoc_cookie_life = ktime_to_ms(asoc->cookie_life); list_for_each(pos, &asoc->peer.transport_addr_list) { cnt ++; @@ -6030,7 +6036,6 @@ fail: */ static int sctp_get_port(struct sock *sk, unsigned short snum) { - long ret; union sctp_addr addr; struct sctp_af *af = sctp_sk(sk)->pf->af; @@ -6039,9 +6044,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum) addr.v4.sin_port = htons(snum); /* Note: sk->sk_num gets filled in if ephemeral port request. */ - ret = sctp_get_port_local(sk, &addr); - - return ret ? 1 : 0; + return !!sctp_get_port_local(sk, &addr); } /* @@ -6856,7 +6859,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_reuse = sk->sk_reuse; newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = inet_sock_destruct; + newsk->sk_destruct = sctp_destruct_sock; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; diff --git a/net/socket.c b/net/socket.c index 3eec3f76b49..4da14cbd49b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -107,6 +107,7 @@ #include <net/ll_poll.h> #ifdef CONFIG_NET_LL_RX_POLL +unsigned int sysctl_net_ll_read __read_mostly; unsigned int sysctl_net_ll_poll __read_mostly; #endif @@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite); /* No kernel lock held - perfect */ static unsigned int sock_poll(struct file *file, poll_table *wait) { + unsigned int ll_flag = 0; struct socket *sock; /* * We can't return errors to poll, so it's either yes or no. */ sock = file->private_data; - return sock->ops->poll(file, sock, wait); + + if (sk_valid_ll(sock->sk)) { + /* this socket can poll_ll so tell the system call */ + ll_flag = POLL_LL; + + /* once, only if requested by syscall */ + if (wait && (wait->_key & POLL_LL)) + sk_poll_ll(sock->sk, 1); + } + + return ll_flag | sock->ops->poll(file, sock, wait); } static int sock_mmap(struct file *file, struct vm_area_struct *vma) |