aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-08-12 06:43:53 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-08-12 06:43:53 -0700
commitce8a84ef1e4b30bcee78aa99bc1032db90a6c1c4 (patch)
tree3faf99c6fbd99eedce3ad2193ce779c25bfc8064
parenteeca7360f756f7e36e846f35018df20808c7ef63 (diff)
parentd80bcf46f1dae47805260dc60fb900cc4dabe35e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (44 commits) e1000e: increase driver version number e1000e: alternate MAC address update e1000e: do not disable receiver on 82574/82583 e1000e: alternate MAC address does not work on device id 0x1060 PCnet: Fix section mismatch bnx2x: disable dcb on 578xx since not supported yet bnx2x: properly clean indirect addresses bnx2x: prevent race between undi_unload and load flows bnx2x: fix select_queue when FCoE is disabled bnx2x: init FCOE FP only once ipv4: some rt_iif -> rt_route_iif conversions net/bridge/netfilter/ebtables.c: use available error handling code net/netlabel/netlabel_kapi.c: add missing cleanup code net/irda: sh_sir: tidyup compile warning net/irda: sh_sir: add missing header net/irda: sh_irda: add missing header slcan: ldisc generated skbs are received in softirq context scm: Capture the full credentials of the scm sender tcp: initialize variable ecn_ok in syncookies path drivers/net/wireless/wl1251: add missing kfree ...
-rw-r--r--Documentation/networking/bonding.txt29
-rw-r--r--Documentation/networking/scaling.txt371
-rw-r--r--drivers/net/bnx2x/bnx2x_cmn.c35
-rw-r--r--drivers/net/bnx2x/bnx2x_dcb.c2
-rw-r--r--drivers/net/bnx2x/bnx2x_main.c23
-rw-r--r--drivers/net/bnx2x/bnx2x_reg.h26
-rw-r--r--drivers/net/can/slcan.c2
-rw-r--r--drivers/net/e1000e/82571.c6
-rw-r--r--drivers/net/e1000e/e1000.h1
-rw-r--r--drivers/net/e1000e/ethtool.c3
-rw-r--r--drivers/net/e1000e/lib.c7
-rw-r--r--drivers/net/e1000e/netdev.c9
-rw-r--r--drivers/net/gianfar_ptp.c9
-rw-r--r--drivers/net/irda/sh_irda.c2
-rw-r--r--drivers/net/irda/sh_sir.c4
-rw-r--r--drivers/net/pcnet32.c2
-rw-r--r--drivers/net/phy/dp83640.c5
-rw-r--r--drivers/net/slip.c2
-rw-r--r--drivers/net/usb/rtl8150.c1
-rw-r--r--drivers/net/wireless/ath/ath5k/base.c23
-rw-r--r--drivers/net/wireless/ath/ath9k/ar9003_eeprom.c8
-rw-r--r--drivers/net/wireless/ath/ath9k/ar9003_phy.h2
-rw-r--r--drivers/net/wireless/b43/dma.c20
-rw-r--r--drivers/net/wireless/rt2x00/rt2800usb.c2
-rw-r--r--drivers/net/wireless/rt2x00/rt73usb.c1
-rw-r--r--drivers/net/wireless/rtlwifi/rtl8192cu/sw.c11
-rw-r--r--drivers/net/wireless/wl1251/acx.c6
-rw-r--r--drivers/net/wireless/wl1251/cmd.c2
-rw-r--r--fs/compat_ioctl.c1
-rw-r--r--include/linux/netlink.h2
-rw-r--r--include/linux/socket.h6
-rw-r--r--include/net/inet_sock.h2
-rw-r--r--net/bridge/br_if.c6
-rw-r--r--net/bridge/br_notify.c7
-rw-r--r--net/bridge/netfilter/ebtables.c3
-rw-r--r--net/core/scm.c2
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/ip_sockglue.c9
-rw-r--r--net/ipv4/netfilter.c18
-rw-r--r--net/ipv4/raw.c3
-rw-r--r--net/ipv4/route.c9
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv6/syncookies.c2
-rw-r--r--net/netfilter/nf_queue.c1
-rw-r--r--net/netlabel/netlabel_kapi.c20
-rw-r--r--net/sched/sch_prio.c2
46 files changed, 604 insertions, 106 deletions
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 5dd960d7517..91df678fb7f 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -238,6 +238,18 @@ ad_select
This option was added in bonding version 3.4.0.
+all_slaves_active
+
+ Specifies that duplicate frames (received on inactive ports) should be
+ dropped (0) or delivered (1).
+
+ Normally, bonding will drop duplicate frames (received on inactive
+ ports), which is desirable for most users. But there are some times
+ it is nice to allow duplicate frames to be delivered.
+
+ The default value is 0 (drop duplicate frames received on inactive
+ ports).
+
arp_interval
Specifies the ARP link monitoring frequency in milliseconds.
@@ -433,6 +445,23 @@ miimon
determined. See the High Availability section for additional
information. The default value is 0.
+min_links
+
+ Specifies the minimum number of links that must be active before
+ asserting carrier. It is similar to the Cisco EtherChannel min-links
+ feature. This allows setting the minimum number of member ports that
+ must be up (link-up state) before marking the bond device as up
+ (carrier on). This is useful for situations where higher level services
+ such as clustering want to ensure a minimum number of low bandwidth
+ links are active before switchover. This option only affect 802.3ad
+ mode.
+
+ The default value is 0. This will cause carrier to be asserted (for
+ 802.3ad mode) whenever there is an active aggregator, regardless of the
+ number of available links in that aggregator. Note that, because an
+ aggregator cannot be active without at least one available link,
+ setting this option to 0 or to 1 has the exact same effect.
+
mode
Specifies one of the bonding policies. The default is
diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
new file mode 100644
index 00000000000..7254b4b5910
--- /dev/null
+++ b/Documentation/networking/scaling.txt
@@ -0,0 +1,371 @@
+Scaling in the Linux Networking Stack
+
+
+Introduction
+============
+
+This document describes a set of complementary techniques in the Linux
+networking stack to increase parallelism and improve performance for
+multi-processor systems.
+
+The following technologies are described:
+
+ RSS: Receive Side Scaling
+ RPS: Receive Packet Steering
+ RFS: Receive Flow Steering
+ Accelerated Receive Flow Steering
+ XPS: Transmit Packet Steering
+
+
+RSS: Receive Side Scaling
+=========================
+
+Contemporary NICs support multiple receive and transmit descriptor queues
+(multi-queue). On reception, a NIC can send different packets to different
+queues to distribute processing among CPUs. The NIC distributes packets by
+applying a filter to each packet that assigns it to one of a small number
+of logical flows. Packets for each flow are steered to a separate receive
+queue, which in turn can be processed by separate CPUs. This mechanism is
+generally known as “Receive-side Scaling” (RSS). The goal of RSS and
+the other scaling techniques to increase performance uniformly.
+Multi-queue distribution can also be used for traffic prioritization, but
+that is not the focus of these techniques.
+
+The filter used in RSS is typically a hash function over the network
+and/or transport layer headers-- for example, a 4-tuple hash over
+IP addresses and TCP ports of a packet. The most common hardware
+implementation of RSS uses a 128-entry indirection table where each entry
+stores a queue number. The receive queue for a packet is determined
+by masking out the low order seven bits of the computed hash for the
+packet (usually a Toeplitz hash), taking this number as a key into the
+indirection table and reading the corresponding value.
+
+Some advanced NICs allow steering packets to queues based on
+programmable filters. For example, webserver bound TCP port 80 packets
+can be directed to their own receive queue. Such “n-tuple” filters can
+be configured from ethtool (--config-ntuple).
+
+==== RSS Configuration
+
+The driver for a multi-queue capable NIC typically provides a kernel
+module parameter for specifying the number of hardware queues to
+configure. In the bnx2x driver, for instance, this parameter is called
+num_queues. A typical RSS configuration would be to have one receive queue
+for each CPU if the device supports enough queues, or otherwise at least
+one for each cache domain at a particular cache level (L1, L2, etc.).
+
+The indirection table of an RSS device, which resolves a queue by masked
+hash, is usually programmed by the driver at initialization. The
+default mapping is to distribute the queues evenly in the table, but the
+indirection table can be retrieved and modified at runtime using ethtool
+commands (--show-rxfh-indir and --set-rxfh-indir). Modifying the
+indirection table could be done to give different queues different
+relative weights.
+
+== RSS IRQ Configuration
+
+Each receive queue has a separate IRQ associated with it. The NIC triggers
+this to notify a CPU when new packets arrive on the given queue. The
+signaling path for PCIe devices uses message signaled interrupts (MSI-X),
+that can route each interrupt to a particular CPU. The active mapping
+of queues to IRQs can be determined from /proc/interrupts. By default,
+an IRQ may be handled on any CPU. Because a non-negligible part of packet
+processing takes place in receive interrupt handling, it is advantageous
+to spread receive interrupts between CPUs. To manually adjust the IRQ
+affinity of each interrupt see Documentation/IRQ-affinity. Some systems
+will be running irqbalance, a daemon that dynamically optimizes IRQ
+assignments and as a result may override any manual settings.
+
+== Suggested Configuration
+
+RSS should be enabled when latency is a concern or whenever receive
+interrupt processing forms a bottleneck. Spreading load between CPUs
+decreases queue length. For low latency networking, the optimal setting
+is to allocate as many queues as there are CPUs in the system (or the
+NIC maximum, if lower). Because the aggregate number of interrupts grows
+with each additional queue, the most efficient high-rate configuration
+is likely the one with the smallest number of receive queues where no
+CPU that processes receive interrupts reaches 100% utilization. Per-cpu
+load can be observed using the mpstat utility.
+
+
+RPS: Receive Packet Steering
+============================
+
+Receive Packet Steering (RPS) is logically a software implementation of
+RSS. Being in software, it is necessarily called later in the datapath.
+Whereas RSS selects the queue and hence CPU that will run the hardware
+interrupt handler, RPS selects the CPU to perform protocol processing
+above the interrupt handler. This is accomplished by placing the packet
+on the desired CPU’s backlog queue and waking up the CPU for processing.
+RPS has some advantages over RSS: 1) it can be used with any NIC,
+2) software filters can easily be added to hash over new protocols,
+3) it does not increase hardware device interrupt rate (although it does
+introduce inter-processor interrupts (IPIs)).
+
+RPS is called during bottom half of the receive interrupt handler, when
+a driver sends a packet up the network stack with netif_rx() or
+netif_receive_skb(). These call the get_rps_cpu() function, which
+selects the queue that should process a packet.
+
+The first step in determining the target CPU for RPS is to calculate a
+flow hash over the packet’s addresses or ports (2-tuple or 4-tuple hash
+depending on the protocol). This serves as a consistent hash of the
+associated flow of the packet. The hash is either provided by hardware
+or will be computed in the stack. Capable hardware can pass the hash in
+the receive descriptor for the packet; this would usually be the same
+hash used for RSS (e.g. computed Toeplitz hash). The hash is saved in
+skb->rx_hash and can be used elsewhere in the stack as a hash of the
+packet’s flow.
+
+Each receive hardware queue has an associated list of CPUs to which
+RPS may enqueue packets for processing. For each received packet,
+an index into the list is computed from the flow hash modulo the size
+of the list. The indexed CPU is the target for processing the packet,
+and the packet is queued to the tail of that CPU’s backlog queue. At
+the end of the bottom half routine, IPIs are sent to any CPUs for which
+packets have been queued to their backlog queue. The IPI wakes backlog
+processing on the remote CPU, and any queued packets are then processed
+up the networking stack.
+
+==== RPS Configuration
+
+RPS requires a kernel compiled with the CONFIG_RPS kconfig symbol (on
+by default for SMP). Even when compiled in, RPS remains disabled until
+explicitly configured. The list of CPUs to which RPS may forward traffic
+can be configured for each receive queue using a sysfs file entry:
+
+ /sys/class/net/<dev>/queues/rx-<n>/rps_cpus
+
+This file implements a bitmap of CPUs. RPS is disabled when it is zero
+(the default), in which case packets are processed on the interrupting
+CPU. Documentation/IRQ-affinity.txt explains how CPUs are assigned to
+the bitmap.
+
+== Suggested Configuration
+
+For a single queue device, a typical RPS configuration would be to set
+the rps_cpus to the CPUs in the same cache domain of the interrupting
+CPU. If NUMA locality is not an issue, this could also be all CPUs in
+the system. At high interrupt rate, it might be wise to exclude the
+interrupting CPU from the map since that already performs much work.
+
+For a multi-queue system, if RSS is configured so that a hardware
+receive queue is mapped to each CPU, then RPS is probably redundant
+and unnecessary. If there are fewer hardware queues than CPUs, then
+RPS might be beneficial if the rps_cpus for each queue are the ones that
+share the same cache domain as the interrupting CPU for that queue.
+
+
+RFS: Receive Flow Steering
+==========================
+
+While RPS steers packets solely based on hash, and thus generally
+provides good load distribution, it does not take into account
+application locality. This is accomplished by Receive Flow Steering
+(RFS). The goal of RFS is to increase datacache hitrate by steering
+kernel processing of packets to the CPU where the application thread
+consuming the packet is running. RFS relies on the same RPS mechanisms
+to enqueue packets onto the backlog of another CPU and to wake up that
+CPU.
+
+In RFS, packets are not forwarded directly by the value of their hash,
+but the hash is used as index into a flow lookup table. This table maps
+flows to the CPUs where those flows are being processed. The flow hash
+(see RPS section above) is used to calculate the index into this table.
+The CPU recorded in each entry is the one which last processed the flow.
+If an entry does not hold a valid CPU, then packets mapped to that entry
+are steered using plain RPS. Multiple table entries may point to the
+same CPU. Indeed, with many flows and few CPUs, it is very likely that
+a single application thread handles flows with many different flow hashes.
+
+rps_sock_table is a global flow table that contains the *desired* CPU for
+flows: the CPU that is currently processing the flow in userspace. Each
+table value is a CPU index that is updated during calls to recvmsg and
+sendmsg (specifically, inet_recvmsg(), inet_sendmsg(), inet_sendpage()
+and tcp_splice_read()).
+
+When the scheduler moves a thread to a new CPU while it has outstanding
+receive packets on the old CPU, packets may arrive out of order. To
+avoid this, RFS uses a second flow table to track outstanding packets
+for each flow: rps_dev_flow_table is a table specific to each hardware
+receive queue of each device. Each table value stores a CPU index and a
+counter. The CPU index represents the *current* CPU onto which packets
+for this flow are enqueued for further kernel processing. Ideally, kernel
+and userspace processing occur on the same CPU, and hence the CPU index
+in both tables is identical. This is likely false if the scheduler has
+recently migrated a userspace thread while the kernel still has packets
+enqueued for kernel processing on the old CPU.
+
+The counter in rps_dev_flow_table values records the length of the current
+CPU's backlog when a packet in this flow was last enqueued. Each backlog
+queue has a head counter that is incremented on dequeue. A tail counter
+is computed as head counter + queue length. In other words, the counter
+in rps_dev_flow_table[i] records the last element in flow i that has
+been enqueued onto the currently designated CPU for flow i (of course,
+entry i is actually selected by hash and multiple flows may hash to the
+same entry i).
+
+And now the trick for avoiding out of order packets: when selecting the
+CPU for packet processing (from get_rps_cpu()) the rps_sock_flow table
+and the rps_dev_flow table of the queue that the packet was received on
+are compared. If the desired CPU for the flow (found in the
+rps_sock_flow table) matches the current CPU (found in the rps_dev_flow
+table), the packet is enqueued onto that CPU’s backlog. If they differ,
+the current CPU is updated to match the desired CPU if one of the
+following is true:
+
+- The current CPU's queue head counter >= the recorded tail counter
+ value in rps_dev_flow[i]
+- The current CPU is unset (equal to NR_CPUS)
+- The current CPU is offline
+
+After this check, the packet is sent to the (possibly updated) current
+CPU. These rules aim to ensure that a flow only moves to a new CPU when
+there are no packets outstanding on the old CPU, as the outstanding
+packets could arrive later than those about to be processed on the new
+CPU.
+
+==== RFS Configuration
+
+RFS is only available if the kconfig symbol CONFIG_RFS is enabled (on
+by default for SMP). The functionality remains disabled until explicitly
+configured. The number of entries in the global flow table is set through:
+
+ /proc/sys/net/core/rps_sock_flow_entries
+
+The number of entries in the per-queue flow table are set through:
+
+ /sys/class/net/<dev>/queues/tx-<n>/rps_flow_cnt
+
+== Suggested Configuration
+
+Both of these need to be set before RFS is enabled for a receive queue.
+Values for both are rounded up to the nearest power of two. The
+suggested flow count depends on the expected number of active connections
+at any given time, which may be significantly less than the number of open
+connections. We have found that a value of 32768 for rps_sock_flow_entries
+works fairly well on a moderately loaded server.
+
+For a single queue device, the rps_flow_cnt value for the single queue
+would normally be configured to the same value as rps_sock_flow_entries.
+For a multi-queue device, the rps_flow_cnt for each queue might be
+configured as rps_sock_flow_entries / N, where N is the number of
+queues. So for instance, if rps_flow_entries is set to 32768 and there
+are 16 configured receive queues, rps_flow_cnt for each queue might be
+configured as 2048.
+
+
+Accelerated RFS
+===============
+
+Accelerated RFS is to RFS what RSS is to RPS: a hardware-accelerated load
+balancing mechanism that uses soft state to steer flows based on where
+the application thread consuming the packets of each flow is running.
+Accelerated RFS should perform better than RFS since packets are sent
+directly to a CPU local to the thread consuming the data. The target CPU
+will either be the same CPU where the application runs, or at least a CPU
+which is local to the application thread’s CPU in the cache hierarchy.
+
+To enable accelerated RFS, the networking stack calls the
+ndo_rx_flow_steer driver function to communicate the desired hardware
+queue for packets matching a particular flow. The network stack
+automatically calls this function every time a flow entry in
+rps_dev_flow_table is updated. The driver in turn uses a device specific
+method to program the NIC to steer the packets.
+
+The hardware queue for a flow is derived from the CPU recorded in
+rps_dev_flow_table. The stack consults a CPU to hardware queue map which
+is maintained by the NIC driver. This is an auto-generated reverse map of
+the IRQ affinity table shown by /proc/interrupts. Drivers can use
+functions in the cpu_rmap (“CPU affinity reverse map”) kernel library
+to populate the map. For each CPU, the corresponding queue in the map is
+set to be one whose processing CPU is closest in cache locality.
+
+==== Accelerated RFS Configuration
+
+Accelerated RFS is only available if the kernel is compiled with
+CONFIG_RFS_ACCEL and support is provided by the NIC device and driver.
+It also requires that ntuple filtering is enabled via ethtool. The map
+of CPU to queues is automatically deduced from the IRQ affinities
+configured for each receive queue by the driver, so no additional
+configuration should be necessary.
+
+== Suggested Configuration
+
+This technique should be enabled whenever one wants to use RFS and the
+NIC supports hardware acceleration.
+
+XPS: Transmit Packet Steering
+=============================
+
+Transmit Packet Steering is a mechanism for intelligently selecting
+which transmit queue to use when transmitting a packet on a multi-queue
+device. To accomplish this, a mapping from CPU to hardware queue(s) is
+recorded. The goal of this mapping is usually to assign queues
+exclusively to a subset of CPUs, where the transmit completions for
+these queues are processed on a CPU within this set. This choice
+provides two benefits. First, contention on the device queue lock is
+significantly reduced since fewer CPUs contend for the same queue
+(contention can be eliminated completely if each CPU has its own
+transmit queue). Secondly, cache miss rate on transmit completion is
+reduced, in particular for data cache lines that hold the sk_buff
+structures.
+
+XPS is configured per transmit queue by setting a bitmap of CPUs that
+may use that queue to transmit. The reverse mapping, from CPUs to
+transmit queues, is computed and maintained for each network device.
+When transmitting the first packet in a flow, the function
+get_xps_queue() is called to select a queue. This function uses the ID
+of the running CPU as a key into the CPU-to-queue lookup table. If the
+ID matches a single queue, that is used for transmission. If multiple
+queues match, one is selected by using the flow hash to compute an index
+into the set.
+
+The queue chosen for transmitting a particular flow is saved in the
+corresponding socket structure for the flow (e.g. a TCP connection).
+This transmit queue is used for subsequent packets sent on the flow to
+prevent out of order (ooo) packets. The choice also amortizes the cost
+of calling get_xps_queues() over all packets in the connection. To avoid
+ooo packets, the queue for a flow can subsequently only be changed if
+skb->ooo_okay is set for a packet in the flow. This flag indicates that
+there are no outstanding packets in the flow, so the transmit queue can
+change without the risk of generating out of order packets. The
+transport layer is responsible for setting ooo_okay appropriately. TCP,
+for instance, sets the flag when all data for a connection has been
+acknowledged.
+
+==== XPS Configuration
+
+XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
+default for SMP). The functionality remains disabled until explicitly
+configured. To enable XPS, the bitmap of CPUs that may use a transmit
+queue is configured using the sysfs file entry:
+
+/sys/class/net/<dev>/queues/tx-<n>/xps_cpus
+
+== Suggested Configuration
+
+For a network device with a single transmission queue, XPS configuration
+has no effect, since there is no choice in this case. In a multi-queue
+system, XPS is preferably configured so that each CPU maps onto one queue.
+If there are as many queues as there are CPUs in the system, then each
+queue can also map onto one CPU, resulting in exclusive pairings that
+experience no contention. If there are fewer queues than CPUs, then the
+best CPUs to share a given queue are probably those that share the cache
+with the CPU that processes transmit completions for that queue
+(transmit interrupts).
+
+
+Further Information
+===================
+RPS and RFS were introduced in kernel 2.6.35. XPS was incorporated into
+2.6.38. Original patches were submitted by Tom Herbert
+(therbert@google.com)
+
+Accelerated RFS was introduced in 2.6.35. Original patches were
+submitted by Ben Hutchings (bhutchings@solarflare.com)
+
+Authors:
+Tom Herbert (therbert@google.com)
+Willem de Bruijn (willemb@google.com)
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index d724a18b528..37e5790681a 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -63,8 +63,9 @@ static inline void bnx2x_bz_fp(struct bnx2x *bp, int index)
fp->disable_tpa = ((bp->flags & TPA_ENABLE_FLAG) == 0);
#ifdef BCM_CNIC
- /* We don't want TPA on FCoE, FWD and OOO L2 rings */
- bnx2x_fcoe(bp, disable_tpa) = 1;
+ /* We don't want TPA on an FCoE L2 ring */
+ if (IS_FCOE_FP(fp))
+ fp->disable_tpa = 1;
#endif
}
@@ -1404,10 +1405,9 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw)
u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb)
{
struct bnx2x *bp = netdev_priv(dev);
+
#ifdef BCM_CNIC
- if (NO_FCOE(bp))
- return skb_tx_hash(dev, skb);
- else {
+ if (!NO_FCOE(bp)) {
struct ethhdr *hdr = (struct ethhdr *)skb->data;
u16 ether_type = ntohs(hdr->h_proto);
@@ -1424,8 +1424,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb)
return bnx2x_fcoe_tx(bp, txq_index);
}
#endif
- /* Select a none-FCoE queue: if FCoE is enabled, exclude FCoE L2 ring
- */
+ /* select a non-FCoE queue */
return __skb_tx_hash(dev, skb, BNX2X_NUM_ETH_QUEUES(bp));
}
@@ -1448,6 +1447,28 @@ void bnx2x_set_num_queues(struct bnx2x *bp)
bp->num_queues += NON_ETH_CONTEXT_USE;
}
+/**
+ * bnx2x_set_real_num_queues - configure netdev->real_num_[tx,rx]_queues
+ *
+ * @bp: Driver handle
+ *
+ * We currently support for at most 16 Tx queues for each CoS thus we will
+ * allocate a multiple of 16 for ETH L2 rings according to the value of the
+ * bp->max_cos.
+ *
+ * If there is an FCoE L2 queue the appropriate Tx queue will have the next
+ * index after all ETH L2 indices.
+ *
+ * If the actual number of Tx queues (for each CoS) is less than 16 then there
+ * will be the holes at the end of each group of 16 ETh L2 indices (0..15,
+ * 16..31,...) with indicies that are not coupled with any real Tx queue.
+ *
+ * The proper configuration of skb->queue_mapping is handled by
+ * bnx2x_select_queue() and __skb_tx_hash().
+ *
+ * bnx2x_setup_tc() takes care of the proper TC mappings so that __skb_tx_hash()
+ * will return a proper Tx index if TC is enabled (netdev->num_tc > 0).
+ */
static inline int bnx2x_set_real_num_queues(struct bnx2x *bp)
{
int rc, tx, rx;
diff --git a/drivers/net/bnx2x/bnx2x_dcb.c b/drivers/net/bnx2x/bnx2x_dcb.c
index a4ea35f6a45..a1e004a82f7 100644
--- a/drivers/net/bnx2x/bnx2x_dcb.c
+++ b/drivers/net/bnx2x/bnx2x_dcb.c
@@ -920,7 +920,7 @@ static void bnx2x_dcbx_admin_mib_updated_params(struct bnx2x *bp,
void bnx2x_dcbx_set_state(struct bnx2x *bp, bool dcb_on, u32 dcbx_enabled)
{
- if (!CHIP_IS_E1x(bp)) {
+ if (!CHIP_IS_E1x(bp) && !CHIP_IS_E3(bp)) {
bp->dcb_state = dcb_on;
bp->dcbx_enabled = dcbx_enabled;
} else {
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 15070911154..f74582a22c6 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -5798,6 +5798,12 @@ static int bnx2x_init_hw_common(struct bnx2x *bp)
DP(BNX2X_MSG_MCP, "starting common init func %d\n", BP_ABS_FUNC(bp));
+ /*
+ * take the UNDI lock to protect undi_unload flow from accessing
+ * registers while we're resetting the chip
+ */
+ bnx2x_acquire_hw_lock(bp, HW_LOCK_RESOURCE_UNDI);
+
bnx2x_reset_common(bp);
REG_WR(bp, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_1_SET, 0xffffffff);
@@ -5808,6 +5814,8 @@ static int bnx2x_init_hw_common(struct bnx2x *bp)
}
REG_WR(bp, GRCBASE_MISC + MISC_REGISTERS_RESET_REG_2_SET, val);
+ bnx2x_release_hw_lock(bp, HW_LOCK_RESOURCE_UNDI);
+
bnx2x_init_block(bp, BLOCK_MISC, PHASE_COMMON);
if (!CHIP_IS_E1x(bp)) {
@@ -10251,10 +10259,17 @@ static int __devinit bnx2x_init_dev(struct pci_dev *pdev,
/* clean indirect addresses */
pci_write_config_dword(bp->pdev, PCICFG_GRC_ADDRESS,
PCICFG_VENDOR_ID_OFFSET);
- REG_WR(bp, PXP2_REG_PGL_ADDR_88_F0 + BP_PORT(bp)*16, 0);
- REG_WR(bp, PXP2_REG_PGL_ADDR_8C_F0 + BP_PORT(bp)*16, 0);
- REG_WR(bp, PXP2_REG_PGL_ADDR_90_F0 + BP_PORT(bp)*16, 0);
- REG_WR(bp, PXP2_REG_PGL_ADDR_94_F0 + BP_PORT(bp)*16, 0);
+ /* Clean the following indirect addresses for all functions since it
+ * is not used by the driver.
+ */
+ REG_WR(bp, PXP2_REG_PGL_ADDR_88_F0, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_8C_F0, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_90_F0, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_94_F0, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_88_F1, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_8C_F1, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_90_F1, 0);
+ REG_WR(bp, PXP2_REG_PGL_ADDR_94_F1, 0);
/*
* Enable internal target-read (in case we are probed after PF FLR).
diff --git a/drivers/net/bnx2x/bnx2x_reg.h b/drivers/net/bnx2x/bnx2x_reg.h
index 27b5ecb1183..40266c14e6d 100644
--- a/drivers/net/bnx2x/bnx2x_reg.h
+++ b/drivers/net/bnx2x/bnx2x_reg.h
@@ -3007,11 +3007,27 @@
/* [R 6] Debug only: Number of used entries in the data FIFO */
#define PXP2_REG_HST_DATA_FIFO_STATUS