diff options
Diffstat (limited to 'drivers/net/cxgb4/sge.c')
-rw-r--r-- | drivers/net/cxgb4/sge.c | 2431 |
1 files changed, 2431 insertions, 0 deletions
diff --git a/drivers/net/cxgb4/sge.c b/drivers/net/cxgb4/sge.c new file mode 100644 index 00000000000..14adc58e71c --- /dev/null +++ b/drivers/net/cxgb4/sge.c @@ -0,0 +1,2431 @@ +/* + * This file is part of the Chelsio T4 Ethernet driver for Linux. + * + * Copyright (c) 2003-2010 Chelsio Communications, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/dma-mapping.h> +#include <linux/jiffies.h> +#include <net/ipv6.h> +#include <net/tcp.h> +#include "cxgb4.h" +#include "t4_regs.h" +#include "t4_msg.h" +#include "t4fw_api.h" + +/* + * Rx buffer size. We use largish buffers if possible but settle for single + * pages under memory shortage. + */ +#if PAGE_SHIFT >= 16 +# define FL_PG_ORDER 0 +#else +# define FL_PG_ORDER (16 - PAGE_SHIFT) +#endif + +/* RX_PULL_LEN should be <= RX_COPY_THRES */ +#define RX_COPY_THRES 256 +#define RX_PULL_LEN 128 + +/* + * Main body length for sk_buffs used for Rx Ethernet packets with fragments. + * Should be >= RX_PULL_LEN but possibly bigger to give pskb_may_pull some room. + */ +#define RX_PKT_SKB_LEN 512 + +/* Ethernet header padding prepended to RX_PKTs */ +#define RX_PKT_PAD 2 + +/* + * Max number of Tx descriptors we clean up at a time. Should be modest as + * freeing skbs isn't cheap and it happens while holding locks. We just need + * to free packets faster than they arrive, we eventually catch up and keep + * the amortized cost reasonable. Must be >= 2 * TXQ_STOP_THRES. + */ +#define MAX_TX_RECLAIM 16 + +/* + * Max number of Rx buffers we replenish at a time. Again keep this modest, + * allocating buffers isn't cheap either. + */ +#define MAX_RX_REFILL 16U + +/* + * Period of the Rx queue check timer. This timer is infrequent as it has + * something to do only when the system experiences severe memory shortage. + */ +#define RX_QCHECK_PERIOD (HZ / 2) + +/* + * Period of the Tx queue check timer. + */ +#define TX_QCHECK_PERIOD (HZ / 2) + +/* + * Max number of Tx descriptors to be reclaimed by the Tx timer. + */ +#define MAX_TIMER_TX_RECLAIM 100 + +/* + * Timer index used when backing off due to memory shortage. + */ +#define NOMEM_TMR_IDX (SGE_NTIMERS - 1) + +/* + * An FL with <= FL_STARVE_THRES buffers is starving and a periodic timer will + * attempt to refill it. + */ +#define FL_STARVE_THRES 4 + +/* + * Suspend an Ethernet Tx queue with fewer available descriptors than this. + * This is the same as calc_tx_descs() for a TSO packet with + * nr_frags == MAX_SKB_FRAGS. + */ +#define ETHTXQ_STOP_THRES \ + (1 + DIV_ROUND_UP((3 * MAX_SKB_FRAGS) / 2 + (MAX_SKB_FRAGS & 1), 8)) + +/* + * Suspension threshold for non-Ethernet Tx queues. We require enough room + * for a full sized WR. + */ +#define TXQ_STOP_THRES (SGE_MAX_WR_LEN / sizeof(struct tx_desc)) + +/* + * Max Tx descriptor space we allow for an Ethernet packet to be inlined + * into a WR. + */ +#define MAX_IMM_TX_PKT_LEN 128 + +/* + * Max size of a WR sent through a control Tx queue. + */ +#define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN + +enum { + /* packet alignment in FL buffers */ + FL_ALIGN = L1_CACHE_BYTES < 32 ? 32 : L1_CACHE_BYTES, + /* egress status entry size */ + STAT_LEN = L1_CACHE_BYTES > 64 ? 128 : 64 +}; + +struct tx_sw_desc { /* SW state per Tx descriptor */ + struct sk_buff *skb; + struct ulptx_sgl *sgl; +}; + +struct rx_sw_desc { /* SW state per Rx descriptor */ + struct page *page; + dma_addr_t dma_addr; +}; + +/* + * The low bits of rx_sw_desc.dma_addr have special meaning. + */ +enum { + RX_LARGE_BUF = 1 << 0, /* buffer is larger than PAGE_SIZE */ + RX_UNMAPPED_BUF = 1 << 1, /* buffer is not mapped */ +}; + +static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d) +{ + return d->dma_addr & ~(dma_addr_t)(RX_LARGE_BUF | RX_UNMAPPED_BUF); +} + +static inline bool is_buf_mapped(const struct rx_sw_desc *d) +{ + return !(d->dma_addr & RX_UNMAPPED_BUF); +} + +/** + * txq_avail - return the number of available slots in a Tx queue + * @q: the Tx queue + * + * Returns the number of descriptors in a Tx queue available to write new + * packets. + */ +static inline unsigned int txq_avail(const struct sge_txq *q) +{ + return q->size - 1 - q->in_use; +} + +/** + * fl_cap - return the capacity of a free-buffer list + * @fl: the FL + * + * Returns the capacity of a free-buffer list. The capacity is less than + * the size because one descriptor needs to be left unpopulated, otherwise + * HW will think the FL is empty. + */ +static inline unsigned int fl_cap(const struct sge_fl *fl) +{ + return fl->size - 8; /* 1 descriptor = 8 buffers */ +} + +static inline bool fl_starving(const struct sge_fl *fl) +{ + return fl->avail - fl->pend_cred <= FL_STARVE_THRES; +} + +static int map_skb(struct device *dev, const struct sk_buff *skb, + dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + *addr = dma_map_single(dev, skb->data, skb_headlen(skb), DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto out_err; + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + + for (fp = si->frags; fp < end; fp++) { + *++addr = dma_map_page(dev, fp->page, fp->page_offset, fp->size, + DMA_TO_DEVICE); + if (dma_mapping_error(dev, *addr)) + goto unwind; + } + return 0; + +unwind: + while (fp-- > si->frags) + dma_unmap_page(dev, *--addr, fp->size, DMA_TO_DEVICE); + + dma_unmap_single(dev, addr[-1], skb_headlen(skb), DMA_TO_DEVICE); +out_err: + return -ENOMEM; +} + +#ifdef CONFIG_NEED_DMA_MAP_STATE +static void unmap_skb(struct device *dev, const struct sk_buff *skb, + const dma_addr_t *addr) +{ + const skb_frag_t *fp, *end; + const struct skb_shared_info *si; + + dma_unmap_single(dev, *addr++, skb_headlen(skb), DMA_TO_DEVICE); + + si = skb_shinfo(skb); + end = &si->frags[si->nr_frags]; + for (fp = si->frags; fp < end; fp++) + dma_unmap_page(dev, *addr++, fp->size, DMA_TO_DEVICE); +} + +/** + * deferred_unmap_destructor - unmap a packet when it is freed + * @skb: the packet + * + * This is the packet destructor used for Tx packets that need to remain + * mapped until they are freed rather than until their Tx descriptors are + * freed. + */ +static void deferred_unmap_destructor(struct sk_buff *skb) +{ + unmap_skb(skb->dev->dev.parent, skb, (dma_addr_t *)skb->head); +} +#endif + +static void unmap_sgl(struct device *dev, const struct sk_buff *skb, + const struct ulptx_sgl *sgl, const struct sge_txq *q) +{ + const struct ulptx_sge_pair *p; + unsigned int nfrags = skb_shinfo(skb)->nr_frags; + + if (likely(skb_headlen(skb))) + dma_unmap_single(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + else { + dma_unmap_page(dev, be64_to_cpu(sgl->addr0), ntohl(sgl->len0), + DMA_TO_DEVICE); + nfrags--; + } + + /* + * the complexity below is because of the possibility of a wrap-around + * in the middle of an SGL + */ + for (p = sgl->sge; nfrags >= 2; nfrags -= 2) { + if (likely((u8 *)(p + 1) <= (u8 *)q->stat)) { +unmap: dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(p->addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p++; + } else if ((u8 *)p == (u8 *)q->stat) { + p = (const struct ulptx_sge_pair *)q->desc; + goto unmap; + } else if ((u8 *)p + 8 == (u8 *)q->stat) { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[1]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[2]; + } else { + const __be64 *addr = (const __be64 *)q->desc; + + dma_unmap_page(dev, be64_to_cpu(p->addr[0]), + ntohl(p->len[0]), DMA_TO_DEVICE); + dma_unmap_page(dev, be64_to_cpu(addr[0]), + ntohl(p->len[1]), DMA_TO_DEVICE); + p = (const struct ulptx_sge_pair *)&addr[1]; + } + } + if (nfrags) { + __be64 addr; + + if ((u8 *)p == (u8 *)q->stat) + p = (const struct ulptx_sge_pair *)q->desc; + addr = (u8 *)p + 16 <= (u8 *)q->stat ? p->addr[0] : + *(const __be64 *)q->desc; + dma_unmap_page(dev, be64_to_cpu(addr), ntohl(p->len[0]), + DMA_TO_DEVICE); + } +} + +/** + * free_tx_desc - reclaims Tx descriptors and their buffers + * @adapter: the adapter + * @q: the Tx queue to reclaim descriptors from + * @n: the number of descriptors to reclaim + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors from an SGE Tx queue and frees the associated + * Tx buffers. Called with the Tx queue lock held. + */ +static void free_tx_desc(struct adapter *adap, struct sge_txq *q, + unsigned int n, bool unmap) +{ + struct tx_sw_desc *d; + unsigned int cidx = q->cidx; + struct device *dev = adap->pdev_dev; + + d = &q->sdesc[cidx]; + while (n--) { + if (d->skb) { /* an SGL is present */ + if (unmap) + unmap_sgl(dev, d->skb, d->sgl, q); + kfree_skb(d->skb); + d->skb = NULL; + } + ++d; + if (++cidx == q->size) { + cidx = 0; + d = q->sdesc; + } + } + q->cidx = cidx; +} + +/* + * Return the number of reclaimable descriptors in a Tx queue. + */ +static inline int reclaimable(const struct sge_txq *q) +{ + int hw_cidx = ntohs(q->stat->cidx); + hw_cidx -= q->cidx; + return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx; +} + +/** + * reclaim_completed_tx - reclaims completed Tx descriptors + * @adap: the adapter + * @q: the Tx queue to reclaim completed descriptors from + * @unmap: whether the buffers should be unmapped for DMA + * + * Reclaims Tx descriptors that the SGE has indicated it has processed, + * and frees the associated buffers if possible. Called with the Tx + * queue locked. + */ +static inline void reclaim_completed_tx(struct adapter *adap, struct sge_txq *q, + bool unmap) +{ + int avail = reclaimable(q); + + if (avail) { + /* + * Limit the amount of clean up work we do at a time to keep + * the Tx lock hold time O(1). + */ + if (avail > MAX_TX_RECLAIM) + avail = MAX_TX_RECLAIM; + + free_tx_desc(adap, q, avail, unmap); + q->in_use -= avail; + } +} + +static inline int get_buf_size(const struct rx_sw_desc *d) +{ +#if FL_PG_ORDER > 0 + return (d->dma_addr & RX_LARGE_BUF) ? (PAGE_SIZE << FL_PG_ORDER) : + PAGE_SIZE; +#else + return PAGE_SIZE; +#endif +} + +/** + * free_rx_bufs - free the Rx buffers on an SGE free list + * @adap: the adapter + * @q: the SGE free list to free buffers from + * @n: how many buffers to free + * + * Release the next @n buffers on an SGE free-buffer Rx queue. The + * buffers must be made inaccessible to HW before calling this function. + */ +static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n) +{ + while (n--) { + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(d), PCI_DMA_FROMDEVICE); + put_page(d->page); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; + } +} + +/** + * unmap_rx_buf - unmap the current Rx buffer on an SGE free list + * @adap: the adapter + * @q: the SGE free list + * + * Unmap the current buffer on an SGE free-buffer Rx queue. The + * buffer must be made inaccessible to HW before calling this function. + * + * This is similar to @free_rx_bufs above but does not free the buffer. + * Do note that the FL still loses any further access to the buffer. + */ +static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q) +{ + struct rx_sw_desc *d = &q->sdesc[q->cidx]; + + if (is_buf_mapped(d)) + dma_unmap_page(adap->pdev_dev, get_buf_addr(d), + get_buf_size(d), PCI_DMA_FROMDEVICE); + d->page = NULL; + if (++q->cidx == q->size) + q->cidx = 0; + q->avail--; +} + +static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) +{ + if (q->pend_cred >= 8) { + wmb(); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), DBPRIO | + QID(q->cntxt_id) | PIDX(q->pend_cred / 8)); + q->pend_cred &= 7; + } +} + +static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg, + dma_addr_t mapping) +{ + sd->page = pg; + sd->dma_addr = mapping; /* includes size low bits */ +} + +/** + * refill_fl - refill an SGE Rx buffer ring + * @adap: the adapter + * @q: the ring to refill + * @n: the number of new buffers to allocate + * @gfp: the gfp flags for the allocations + * + * (Re)populate an SGE free-buffer queue with up to @n new packet buffers, + * allocated with the supplied gfp flags. The caller must assure that + * @n does not exceed the queue's capacity. If afterwards the queue is + * found critically low mark it as starving in the bitmap of starving FLs. + * + * Returns the number of buffers allocated. + */ +static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, + gfp_t gfp) +{ + struct page *pg; + dma_addr_t mapping; + unsigned int cred = q->avail; + __be64 *d = &q->desc[q->pidx]; + struct rx_sw_desc *sd = &q->sdesc[q->pidx]; + + gfp |= __GFP_NOWARN; /* failures are expected */ + +#if FL_PG_ORDER > 0 + /* + * Prefer large buffers + */ + while (n) { + pg = alloc_pages(gfp | __GFP_COMP, FL_PG_ORDER); + if (unlikely(!pg)) { + q->large_alloc_failed++; + break; /* fall back to single pages */ + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, + PAGE_SIZE << FL_PG_ORDER, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + __free_pages(pg, FL_PG_ORDER); + goto out; /* do not try small pages for this error */ + } + mapping |= RX_LARGE_BUF; + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + n--; + } +#endif + + while (n--) { + pg = __netdev_alloc_page(adap->port[0], gfp); + if (unlikely(!pg)) { + q->alloc_failed++; + break; + } + + mapping = dma_map_page(adap->pdev_dev, pg, 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { + netdev_free_page(adap->port[0], pg); + goto out; + } + *d++ = cpu_to_be64(mapping); + + set_rx_sw_desc(sd, pg, mapping); + sd++; + + q->avail++; + if (++q->pidx == q->size) { + q->pidx = 0; + sd = q->sdesc; + d = q->desc; + } + } + +out: cred = q->avail - cred; + q->pend_cred += cred; + ring_fl_db(adap, q); + + if (unlikely(fl_starving(q))) { + smp_wmb(); + set_bit(q->cntxt_id, adap->sge.starving_fl); + } + + return cred; +} + +static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl) +{ + refill_fl(adap, fl, min(MAX_RX_REFILL, fl_cap(fl) - fl->avail), + GFP_ATOMIC); +} + +/** + * alloc_ring - allocate resources for an SGE descriptor ring + * @dev: the PCI device's core device + * @nelem: the number of descriptors + * @elem_size: the size of each descriptor + * @sw_size: the size of the SW state associated with each ring element + * @phys: the physical address of the allocated ring + * @metadata: address of the array holding the SW state for the ring + * @stat_size: extra space in HW ring for status information + * + * Allocates resources for an SGE descriptor ring, such as Tx queues, + * free buffer lists, or response queues. Each SGE ring requires + * space for its HW descriptors plus, optionally, space for the SW state + * associated with each HW entry (the metadata). The function returns + * three values: the virtual address for the HW ring (the return value + * of the function), the bus address of the HW ring, and the address + * of the SW ring. + */ +static void *alloc_ring(struct device *dev, size_t nelem, size_t elem_size, + size_t sw_size, dma_addr_t *phys, void *metadata, + size_t stat_size) +{ + size_t len = nelem * elem_size + stat_size; + void *s = NULL; + void *p = dma_alloc_coherent(dev, len, phys, GFP_KERNEL); + + if (!p) + return NULL; + if (sw_size) { + s = kcalloc(nelem, sw_size, GFP_KERNEL); + + if (!s) { + dma_free_coherent(dev, len, p, *phys); + return NULL; + } + } + if (metadata) + *(void **)metadata = s; + memset(p, 0, len); + return p; +} + +/** + * sgl_len - calculates the size of an SGL of the given capacity + * @n: the number of SGL entries + * + * Calculates the number of flits needed for a scatter/gather list that + * can hold the given number of entries. + */ +static inline unsigned int sgl_len(unsigned int n) +{ + n--; + return (3 * n) / 2 + (n & 1) + 2; +} + +/** + * flits_to_desc - returns the num of Tx descriptors for the given flits + * @n: the number of flits + * + * Returns the number of Tx descriptors needed for the supplied number + * of flits. + */ +static inline unsigned int flits_to_desc(unsigned int n) +{ + BUG_ON(n > SGE_MAX_WR_LEN / 8); + return DIV_ROUND_UP(n, 8); +} + +/** + * is_eth_imm - can an Ethernet packet be sent as immediate data? + * @skb: the packet + * + * Returns whether an Ethernet packet is small enough to fit as + * immediate data. + */ +static inline int is_eth_imm(const struct sk_buff *skb) +{ + return skb->len <= MAX_IMM_TX_PKT_LEN - sizeof(struct cpl_tx_pkt); +} + +/** + * calc_tx_flits - calculate the number of flits for a packet Tx WR + * @skb: the packet + * + * Returns the number of flits needed for a Tx WR for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_flits(const struct sk_buff *skb) +{ + unsigned int flits; + + if (is_eth_imm(skb)) + return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt), 8); + + flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4; + if (skb_shinfo(skb)->gso_size) + flits += 2; + return flits; +} + +/** + * calc_tx_descs - calculate the number of Tx descriptors for a packet + * @skb: the packet + * + * Returns the number of Tx descriptors needed for the given Ethernet + * packet, including the needed WR and CPL headers. + */ +static inline unsigned int calc_tx_descs(const struct sk_buff *skb) +{ + return flits_to_desc(calc_tx_flits(skb)); +} + +/** + * write_sgl - populate a scatter/gather list for a packet + * @skb: the packet + * @q: the Tx queue we are writing into + * @sgl: starting location for writing the SGL + * @end: points right after the end of the SGL + * @start: start offset into skb main-body data to include in the SGL + * @addr: the list of bus addresses for the SGL elements + * + * Generates a gather list for the buffers that make up a packet. + * The caller must provide adequate space for the SGL that will be written. + * The SGL includes all of the packet's page fragments and the data in its + * main body except for the first @start bytes. @sgl must be 16-byte + * aligned and within a Tx descriptor with available space. @end points + * right after the end of the SGL but does not account for any potential + * wrap around, i.e., @end > @sgl. + */ +static void write_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, unsigned int start, + const dma_addr_t *addr) +{ + unsigned int i, len; + struct ulptx_sge_pair *to; + const struct skb_shared_info *si = skb_shinfo(skb); + unsigned int nfrags = si->nr_frags; + struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1]; + + len = skb_headlen(skb) - start; + if (likely(len)) { + sgl->len0 = htonl(len); + sgl->addr0 = cpu_to_be64(addr[0] + start); + nfrags++; + } else { + sgl->len0 = htonl(si->frags[0].size); + sgl->addr0 = cpu_to_be64(addr[1]); + } + + sgl->cmd_nsge = htonl(ULPTX_CMD(ULP_TX_SC_DSGL) | ULPTX_NSGE(nfrags)); + if (likely(--nfrags == 0)) + return; + /* + * Most of the complexity below deals with the possibility we hit the + * end of the queue in the middle of writing the SGL. For this case + * only we create the SGL in a temporary buffer and then copy it. + */ + to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge; + + for (i = (nfrags != si->nr_frags); nfrags >= 2; nfrags -= 2, to++) { + to->len[0] = cpu_to_be32(si->frags[i].size); + to->len[1] = cpu_to_be32(si->frags[++i].size); + to->addr[0] = cpu_to_be64(addr[i]); + to->addr[1] = cpu_to_be64(addr[++i]); + } + if (nfrags) { + to->len[0] = cpu_to_be32(si->frags[i].size); + to->len[1] = cpu_to_be32(0); + to->addr[0] = cpu_to_be64(addr[i + 1]); + } + if (unlikely((u8 *)end > (u8 *)q->stat)) { + unsigned int part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1; + + if (likely(part0)) + memcpy(sgl->sge, buf, part0); + part1 = (u8 *)end - (u8 *)q->stat; + memcpy(q->desc, (u8 *)buf + part0, part1); + end = (void *)q->desc + part1; + } + if ((uintptr_t)end & 8) /* 0-pad to multiple of 16 */ + *(u64 *)end = 0; +} + +/** + * ring_tx_db - check and potentially ring a Tx queue's doorbell + * @adap: the adapter + * @q: the Tx queue + * @n: number of new descriptors to give to HW + * + * Ring the doorbel for a Tx queue. + */ +static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n) +{ + wmb(); /* write descriptors before telling HW */ + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(q->cntxt_id) | PIDX(n)); +} + +/** + * inline_tx_skb - inline a packet's data into Tx descriptors + * @skb: the packet + * @q: the Tx queue where the packet will be inlined + * @pos: starting position in the Tx queue where to inline the packet + * + * Inline a packet's contents directly into Tx descriptors, starting at + * the given position within the Tx DMA ring. + * Most of the complexity of this operation is dealing with wrap arounds + * in the middle of the packet we want to inline. + */ +static void inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q, + void *pos) +{ + u64 *p; + int left = (void *)q->stat - pos; + + if (likely(skb->len <= left)) { + if (likely(!skb->data_len)) + skb_copy_from_linear_data(skb, pos, skb->len); + else + skb_copy_bits(skb, 0, pos, skb->len); + pos += skb->len; + } else { + skb_copy_bits(skb, 0, pos, left); + skb_copy_bits(skb, left, q->desc, skb->len - left); + pos = (void *)q->desc + (skb->len - left); + } + + /* 0-pad to multiple of 16 */ + p = PTR_ALIGN(pos, 8); + if ((uintptr_t)p & 8) + *p = 0; +} + +/* + * Figure out what HW csum a packet wants and return the appropriate control + * bits. + */ +static u64 hwcsum(const struct sk_buff *skb) +{ + int csum_type; + const struct iphdr *iph = ip_hdr(skb); + + if (iph->version == 4) { + if (iph->protocol == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP; + else if (iph->protocol == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP; + else { +nocsum: /* + * unknown protocol, disable HW csum + * and hope a bad packet is detected + */ + return TXPKT_L4CSUM_DIS; + } + } else { + /* + * this doesn't work with extension headers + */ + const struct ipv6hdr *ip6h = (const struct ipv6hdr *)iph; + + if (ip6h->nexthdr == IPPROTO_TCP) + csum_type = TX_CSUM_TCPIP6; + else if (ip6h->nexthdr == IPPROTO_UDP) + csum_type = TX_CSUM_UDPIP6; + else + goto nocsum; + } + + if (likely(csum_type >= TX_CSUM_TCPIP)) + return TXPKT_CSUM_TYPE(csum_type) | + TXPKT_IPHDR_LEN(skb_network_header_len(skb)) | + TXPKT_ETHHDR_LEN(skb_network_offset(skb) - ETH_HLEN); + else { + int start = skb_transport_offset(skb); + + return TXPKT_CSUM_TYPE(csum_type) | TXPKT_CSUM_START(start) | + TXPKT_CSUM_LOC(start + skb->csum_offset); + } +} + +static void eth_txq_stop(struct sge_eth_txq *q) +{ + netif_tx_stop_queue(q->txq); + q->q.stops++; +} + +static inline void txq_advance(struct sge_txq *q, unsigned int n) +{ + q->in_use += n; + q->pidx += n; + if (q->pidx >= q->size) + q->pidx -= q->size; +} + +/** + * t4_eth_xmit - add a packet to an Ethernet Tx queue + * @skb: the packet + * @dev: the egress net device + * + * Add a packet to an SGE Ethernet Tx queue. Runs with softirqs disabled. + */ +netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + u32 wr_mid; + u64 cntrl, *end; + int qidx, credits; + unsigned int flits, ndesc; + struct adapter *adap; + struct sge_eth_txq *q; + const struct port_info *pi; + struct fw_eth_tx_pkt_wr *wr; + struct cpl_tx_pkt_core *cpl; + const struct skb_shared_info *ssi; + dma_addr_t addr[MAX_SKB_FRAGS + 1]; + + /* + * The chip min packet length is 10 octets but play safe and reject + * anything shorter than an Ethernet header. + */ + if (unlikely(skb->len < ETH_HLEN)) { +out_free: dev_kfree_skb(skb); + return NETDEV_TX_OK; + } + + pi = netdev_priv(dev); + adap = pi->adapter; + qidx = skb_get_queue_mapping(skb); + q = &adap->sge.ethtxq[qidx + pi->first_qset]; + + reclaim_completed_tx(adap, &q->q, true); + + flits = calc_tx_flits(skb); + ndesc = flits_to_desc(flits); + credits = txq_avail(&q->q) - ndesc; + + if (unlikely(credits < 0)) { + eth_txq_stop(q); + dev_err(adap->pdev_dev, + "%s: Tx ring %u full while queue awake!\n", + dev->name, qidx); + return NETDEV_TX_BUSY; + } + + if (!is_eth_imm(skb) && + unlikely(map_skb(adap->pdev_dev, skb, addr) < 0)) { + q->mapping_err++; + goto out_free; + } + + wr_mid = FW_WR_LEN16(DIV_ROUND_UP(flits, 2)); + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ | FW_WR_EQUIQ; + } + + wr = (void *)&q->q.desc[q->q.pidx]; + wr->equiq_to_len16 = htonl(wr_mid); + wr->r3 = cpu_to_be64(0); + end = (u64 *)wr + flits; + + ssi = skb_shinfo(skb); + if (ssi->gso_size) { + struct cpl_tx_pkt_lso *lso = (void *)wr; + bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0; + int l3hdr_len = skb_network_header_len(skb); + int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + + wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN(sizeof(*lso))); + lso->lso_ctrl = htonl(LSO_OPCODE(CPL_TX_PKT_LSO) | + LSO_FIRST_SLICE | LSO_LAST_SLICE | + LSO_IPV6(v6) | + LSO_ETHHDR_LEN(eth_xtra_len / 4) | + LSO_IPHDR_LEN(l3hdr_len / 4) | + LSO_TCPHDR_LEN(tcp_hdr(skb)->doff)); + lso->ipid_ofst = htons(0); + lso->mss = htons(ssi->gso_size); + lso->seqno_offset = htonl(0); + lso->len = htonl(skb->len); + cpl = (void *)(lso + 1); + cntrl = TXPKT_CSUM_TYPE(v6 ? TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) | + TXPKT_IPHDR_LEN(l3hdr_len) | + TXPKT_ETHHDR_LEN(eth_xtra_len); + q->tso++; + q->tx_cso += ssi->gso_segs; + } else { + int len; + + len = is_eth_imm(skb) ? skb->len + sizeof(*cpl) : sizeof(*cpl); + wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN(len)); + cpl = (void *)(wr + 1); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + cntrl = hwcsum(skb) | TXPKT_IPCSUM_DIS; + q->tx_cso++; + } else + cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS; + } + + if (vlan_tx_tag_present(skb)) { + q->vlan_ins++; + cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + } + + cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) | + TXPKT_INTF(pi->tx_chan) | TXPKT_PF(0)); + cpl->pack = htons(0); + cpl->len = htons(skb->len); + cpl->ctrl1 = cpu_to_be64(cntrl); + + if (is_eth_imm(skb)) { + inline_tx_skb(skb, &q->q, cpl + 1); + dev_kfree_skb(skb); + } else { + int last_desc; + + write_sgl(skb, &q->q, (struct ulptx_sgl *)(cpl + 1), end, 0, + addr); + skb_orphan(skb); + + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + q->q.sdesc[last_desc].skb = skb; + q->q.sdesc[last_desc].sgl = (struct ulptx_sgl *)(cpl + 1); + } + + txq_advance(&q->q, ndesc); + + ring_tx_db(adap, &q->q, ndesc); + return NETDEV_TX_OK; +} + +/** + * reclaim_completed_tx_imm - reclaim completed control-queue Tx descs + * @q: the SGE control Tx queue + * + * This is a variant of reclaim_completed_tx() that is used for Tx queues + * that send only immediate data (presently just the control queues) and + * thus do not have any sk_buffs to release. + */ +static inline void reclaim_completed_tx_imm(struct sge_txq *q) +{ + int hw_cidx = ntohs(q->stat->cidx); + int reclaim = hw_cidx - q->cidx; + + if (reclaim < 0) + reclaim += q->size; + + q->in_use -= reclaim; + q->cidx = hw_cidx; +} + +/** + * is_imm - check whether a packet can be sent as immediate data + * @skb: the packet + * + * Returns true if a packet can be sent as a WR with immediate data. + */ +static inline int is_imm(const struct sk_buff *skb) +{ + return skb->len <= MAX_CTRL_WR_LEN; +} + +/** + * ctrlq_check_stop - check if a control queue is full and should stop + * @q: the queue + * @wr: most recent WR written to the queue + * + * Check if a control queue has become full and should be stopped. + * We clean up control queue descriptors very lazily, only when we are out. + * If the queue is still full after reclaiming any completed descriptors + * we suspend it and have the last WR wake it up. + */ +static void ctrlq_check_stop(struct sge_ctrl_txq *q, struct fw_wr_hdr *wr) +{ + reclaim_completed_tx_imm(&q->q); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + wr->lo |= htonl(FW_WR_EQUEQ | FW_WR_EQUIQ); + q->q.stops++; + q->full = 1; + } +} + +/** + * ctrl_xmit - send a packet through an SGE control Tx queue + * @q: the control queue + * @skb: the packet + * + * Send a packet through an SGE control Tx queue. Packets sent through + * a control queue must fit entirely as immediate data. + */ +static int ctrl_xmit(struct sge_ctrl_txq *q, struct sk_buff *skb) +{ + unsigned int ndesc; + struct fw_wr_hdr *wr; + + if (unlikely(!is_imm(skb))) { + WARN_ON(1); + dev_kfree_skb(skb); + return NET_XMIT_DROP; + } + + ndesc = DIV_ROUND_UP(skb->len, sizeof(struct tx_desc)); + spin_lock(&q->sendq.lock); + + if (unlikely(q->full)) { + skb->priority = ndesc; /* save for restart */ + __skb_queue_tail(&q->sendq, skb); + spin_unlock(&q->sendq.lock); + return NET_XMIT_CN; + } + + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + inline_tx_skb(skb, &q->q, wr); + + txq_advance(&q->q, ndesc); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) + ctrlq_check_stop(q, wr); + + ring_tx_db(q->adap, &q->q, ndesc); + spin_unlock(&q->sendq.lock); + + kfree_skb(skb); + return NET_XMIT_SUCCESS; +} + +/** + * restart_ctrlq - restart a suspended control queue + * @data: the control queue to restart + * + * Resumes transmission on a suspended Tx control queue. + */ +static void restart_ctrlq(unsigned long data) +{ + struct sk_buff *skb; + unsigned int written = 0; + struct sge_ctrl_txq *q = (struct sge_ctrl_txq *)data; + + spin_lock(&q->sendq.lock); + reclaim_completed_tx_imm(&q->q); + BUG_ON(txq_avail(&q->q) < TXQ_STOP_THRES); /* q should be empty */ + + while ((skb = __skb_dequeue(&q->sendq)) != NULL) { + struct fw_wr_hdr *wr; + unsigned int ndesc = skb->priority; /* previously saved */ + + /* + * Write descriptors and free skbs outside the lock to limit + * wait times. q->full is still set so new skbs will be queued. + */ + spin_unlock(&q->sendq.lock); + + wr = (struct fw_wr_hdr *)&q->q.desc[q->q.pidx]; + inline_tx_skb(skb, &q->q, wr); + kfree_skb(skb); + + written += ndesc; + txq_advance(&q->q, ndesc); + if (unlikely(txq_avail(&q->q) < TXQ_STOP_THRES)) { + unsigned long old = q->q.stops; + + ctrlq_check_stop(q, wr); + if (q->q.stops != old) { /* suspended anew */ + spin_lock(&q->sendq.lock); + goto ringdb; + } + } + if (written > 16) { + ring_tx_db(q->adap, &q->q, written); + written = 0; + } + spin_lock(&q->sendq.lock); + } + q->full = 0; +ringdb: if (written) + ring_tx_db(q->adap, &q->q, written); + spin_unlock(&q->sendq.lock); +} + +/** + * t4_mgmt_tx - send a management message + * @adap: the adapter + * @skb: the packet containing the management message + * + * Send a management message through control queue 0. + */ +int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb) +{ + int ret; + + local_bh_disabl |