diff options
Diffstat (limited to 'drivers/net/ethernet/chelsio/cxgb4/sge.c')
| -rw-r--r-- | drivers/net/ethernet/chelsio/cxgb4/sge.c | 619 |
1 files changed, 494 insertions, 125 deletions
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index e111d974afd..dd4355d248e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -68,9 +68,6 @@ */ #define RX_PKT_SKB_LEN 512 -/* Ethernet header padding prepended to RX_PKTs */ -#define RX_PKT_PAD 2 - /* * Max number of Tx descriptors we clean up at a time. Should be modest as * freeing skbs isn't cheap and it happens while holding locks. We just need @@ -96,6 +93,16 @@ */ #define TX_QCHECK_PERIOD (HZ / 2) +/* SGE Hung Ingress DMA Threshold Warning time (in Hz) and Warning Repeat Rate + * (in RX_QCHECK_PERIOD multiples). If we find one of the SGE Ingress DMA + * State Machines in the same state for this amount of time (in HZ) then we'll + * issue a warning about a potential hang. We'll repeat the warning as the + * SGE Ingress DMA Channel appears to be hung every N RX_QCHECK_PERIODs till + * the situation clears. If the situation clears, we'll note that as well. + */ +#define SGE_IDMA_WARN_THRESH (1 * HZ) +#define SGE_IDMA_WARN_REPEAT (20 * RX_QCHECK_PERIOD) + /* * Max number of Tx descriptors to be reclaimed by the Tx timer. */ @@ -137,13 +144,6 @@ */ #define MAX_CTRL_WR_LEN SGE_MAX_WR_LEN -enum { - /* packet alignment in FL buffers */ - FL_ALIGN = L1_CACHE_BYTES < 32 ? 32 : L1_CACHE_BYTES, - /* egress status entry size */ - STAT_LEN = L1_CACHE_BYTES > 64 ? 128 : 64 -}; - struct tx_sw_desc { /* SW state per Tx descriptor */ struct sk_buff *skb; struct ulptx_sgl *sgl; @@ -155,16 +155,57 @@ struct rx_sw_desc { /* SW state per Rx descriptor */ }; /* - * The low bits of rx_sw_desc.dma_addr have special meaning. + * Rx buffer sizes for "useskbs" Free List buffers (one ingress packet pe skb + * buffer). We currently only support two sizes for 1500- and 9000-byte MTUs. + * We could easily support more but there doesn't seem to be much need for + * that ... + */ +#define FL_MTU_SMALL 1500 +#define FL_MTU_LARGE 9000 + +static inline unsigned int fl_mtu_bufsize(struct adapter *adapter, + unsigned int mtu) +{ + struct sge *s = &adapter->sge; + + return ALIGN(s->pktshift + ETH_HLEN + VLAN_HLEN + mtu, s->fl_align); +} + +#define FL_MTU_SMALL_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_SMALL) +#define FL_MTU_LARGE_BUFSIZE(adapter) fl_mtu_bufsize(adapter, FL_MTU_LARGE) + +/* + * Bits 0..3 of rx_sw_desc.dma_addr have special meaning. The hardware uses + * these to specify the buffer size as an index into the SGE Free List Buffer + * Size register array. We also use bit 4, when the buffer has been unmapped + * for DMA, but this is of course never sent to the hardware and is only used + * to prevent double unmappings. All of the above requires that the Free List + * Buffers which we allocate have the bottom 5 bits free (0) -- i.e. are + * 32-byte or or a power of 2 greater in alignment. Since the SGE's minimal + * Free List Buffer alignment is 32 bytes, this works out for us ... */ enum { - RX_LARGE_BUF = 1 << 0, /* buffer is larger than PAGE_SIZE */ - RX_UNMAPPED_BUF = 1 << 1, /* buffer is not mapped */ + RX_BUF_FLAGS = 0x1f, /* bottom five bits are special */ + RX_BUF_SIZE = 0x0f, /* bottom three bits are for buf sizes */ + RX_UNMAPPED_BUF = 0x10, /* buffer is not mapped */ + + /* + * XXX We shouldn't depend on being able to use these indices. + * XXX Especially when some other Master PF has initialized the + * XXX adapter or we use the Firmware Configuration File. We + * XXX should really search through the Host Buffer Size register + * XXX array for the appropriately sized buffer indices. + */ + RX_SMALL_PG_BUF = 0x0, /* small (PAGE_SIZE) page buffer */ + RX_LARGE_PG_BUF = 0x1, /* buffer large (FL_PG_ORDER) page buffer */ + + RX_SMALL_MTU_BUF = 0x2, /* small MTU buffer */ + RX_LARGE_MTU_BUF = 0x3, /* large MTU buffer */ }; static inline dma_addr_t get_buf_addr(const struct rx_sw_desc *d) { - return d->dma_addr & ~(dma_addr_t)(RX_LARGE_BUF | RX_UNMAPPED_BUF); + return d->dma_addr & ~(dma_addr_t)RX_BUF_FLAGS; } static inline bool is_buf_mapped(const struct rx_sw_desc *d) @@ -342,7 +383,7 @@ static void free_tx_desc(struct adapter *adap, struct sge_txq *q, if (d->skb) { /* an SGL is present */ if (unmap) unmap_sgl(dev, d->skb, d->sgl, q); - kfree_skb(d->skb); + dev_consume_skb_any(d->skb); d->skb = NULL; } ++d; @@ -392,14 +433,35 @@ static inline void reclaim_completed_tx(struct adapter *adap, struct sge_txq *q, } } -static inline int get_buf_size(const struct rx_sw_desc *d) +static inline int get_buf_size(struct adapter *adapter, + const struct rx_sw_desc *d) { -#if FL_PG_ORDER > 0 - return (d->dma_addr & RX_LARGE_BUF) ? (PAGE_SIZE << FL_PG_ORDER) : - PAGE_SIZE; -#else - return PAGE_SIZE; -#endif + struct sge *s = &adapter->sge; + unsigned int rx_buf_size_idx = d->dma_addr & RX_BUF_SIZE; + int buf_size; + + switch (rx_buf_size_idx) { + case RX_SMALL_PG_BUF: + buf_size = PAGE_SIZE; + break; + + case RX_LARGE_PG_BUF: + buf_size = PAGE_SIZE << s->fl_pg_order; + break; + + case RX_SMALL_MTU_BUF: + buf_size = FL_MTU_SMALL_BUFSIZE(adapter); + break; + + case RX_LARGE_MTU_BUF: + buf_size = FL_MTU_LARGE_BUFSIZE(adapter); + break; + + default: + BUG_ON(1); + } + + return buf_size; } /** @@ -418,7 +480,8 @@ static void free_rx_bufs(struct adapter *adap, struct sge_fl *q, int n) if (is_buf_mapped(d)) dma_unmap_page(adap->pdev_dev, get_buf_addr(d), - get_buf_size(d), PCI_DMA_FROMDEVICE); + get_buf_size(adap, d), + PCI_DMA_FROMDEVICE); put_page(d->page); d->page = NULL; if (++q->cidx == q->size) @@ -444,7 +507,7 @@ static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q) if (is_buf_mapped(d)) dma_unmap_page(adap->pdev_dev, get_buf_addr(d), - get_buf_size(d), PCI_DMA_FROMDEVICE); + get_buf_size(adap, d), PCI_DMA_FROMDEVICE); d->page = NULL; if (++q->cidx == q->size) q->cidx = 0; @@ -453,10 +516,14 @@ static void unmap_rx_buf(struct adapter *adap, struct sge_fl *q) static inline void ring_fl_db(struct adapter *adap, struct sge_fl *q) { + u32 val; if (q->pend_cred >= 8) { + val = PIDX(q->pend_cred / 8); + if (!is_t4(adap->params.chip)) + val |= DBTYPE(1); wmb(); - t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), DBPRIO | - QID(q->cntxt_id) | PIDX(q->pend_cred / 8)); + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), DBPRIO(1) | + QID(q->cntxt_id) | val); q->pend_cred &= 7; } } @@ -485,6 +552,7 @@ static inline void set_rx_sw_desc(struct rx_sw_desc *sd, struct page *pg, static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp) { + struct sge *s = &adap->sge; struct page *pg; dma_addr_t mapping; unsigned int cred = q->avail; @@ -493,25 +561,27 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp |= __GFP_NOWARN | __GFP_COLD; -#if FL_PG_ORDER > 0 + if (s->fl_pg_order == 0) + goto alloc_small_pages; + /* * Prefer large buffers */ while (n) { - pg = alloc_pages(gfp | __GFP_COMP, FL_PG_ORDER); + pg = alloc_pages(gfp | __GFP_COMP, s->fl_pg_order); if (unlikely(!pg)) { q->large_alloc_failed++; break; /* fall back to single pages */ } mapping = dma_map_page(adap->pdev_dev, pg, 0, - PAGE_SIZE << FL_PG_ORDER, + PAGE_SIZE << s->fl_pg_order, PCI_DMA_FROMDEVICE); if (unlikely(dma_mapping_error(adap->pdev_dev, mapping))) { - __free_pages(pg, FL_PG_ORDER); + __free_pages(pg, s->fl_pg_order); goto out; /* do not try small pages for this error */ } - mapping |= RX_LARGE_BUF; + mapping |= RX_LARGE_PG_BUF; *d++ = cpu_to_be64(mapping); set_rx_sw_desc(sd, pg, mapping); @@ -525,10 +595,10 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, } n--; } -#endif +alloc_small_pages: while (n--) { - pg = alloc_page(gfp); + pg = __skb_alloc_page(gfp, NULL); if (unlikely(!pg)) { q->alloc_failed++; break; @@ -646,11 +716,17 @@ static inline unsigned int flits_to_desc(unsigned int n) * @skb: the packet * * Returns whether an Ethernet packet is small enough to fit as - * immediate data. + * immediate data. Return value corresponds to headroom required. */ static inline int is_eth_imm(const struct sk_buff *skb) { - return skb->len <= MAX_IMM_TX_PKT_LEN - sizeof(struct cpl_tx_pkt); + int hdrlen = skb_shinfo(skb)->gso_size ? + sizeof(struct cpl_tx_pkt_lso_core) : 0; + + hdrlen += sizeof(struct cpl_tx_pkt); + if (skb->len <= MAX_IMM_TX_PKT_LEN - hdrlen) + return hdrlen; + return 0; } /** @@ -663,9 +739,10 @@ static inline int is_eth_imm(const struct sk_buff *skb) static inline unsigned int calc_tx_flits(const struct sk_buff *skb) { unsigned int flits; + int hdrlen = is_eth_imm(skb); - if (is_eth_imm(skb)) - return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt), 8); + if (hdrlen) + return DIV_ROUND_UP(skb->len + hdrlen, sizeof(__be64)); flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 4; if (skb_shinfo(skb)->gso_size) @@ -753,7 +830,23 @@ static void write_sgl(const struct sk_buff *skb, struct sge_txq *q, end = (void *)q->desc + part1; } if ((uintptr_t)end & 8) /* 0-pad to multiple of 16 */ - *(u64 *)end = 0; + *end = 0; +} + +/* This function copies 64 byte coalesced work request to + * memory mapped BAR2 space(user space writes). + * For coalesced WR SGE, fetches data from the FIFO instead of from Host. + */ +static void cxgb_pio_copy(u64 __iomem *dst, u64 *src) +{ + int count = 8; + + while (count) { + writeq(*src, dst); + src++; + dst++; + count--; + } } /** @@ -766,14 +859,30 @@ static void write_sgl(const struct sk_buff *skb, struct sge_txq *q, */ static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n) { + unsigned int *wr, index; + unsigned long flags; + wmb(); /* write descriptors before telling HW */ - spin_lock(&q->db_lock); + spin_lock_irqsave(&q->db_lock, flags); if (!q->db_disabled) { - t4_write_reg(adap, MYPF_REG(A_SGE_PF_KDOORBELL), - V_QID(q->cntxt_id) | V_PIDX(n)); - } + if (is_t4(adap->params.chip)) { + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL), + QID(q->cntxt_id) | PIDX(n)); + } else { + if (n == 1) { + index = q->pidx ? (q->pidx - 1) : (q->size - 1); + wr = (unsigned int *)&q->desc[index]; + cxgb_pio_copy((u64 __iomem *) + (adap->bar2 + q->udb + 64), + (u64 *)wr); + } else + writel(n, adap->bar2 + q->udb + 8); + wmb(); + } + } else + q->db_pidx_inc += n; q->db_pidx = q->pidx; - spin_unlock(&q->db_lock); + spin_unlock_irqrestore(&q->db_lock, flags); } /** @@ -881,6 +990,7 @@ static inline void txq_advance(struct sge_txq *q, unsigned int n) */ netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) { + int len; u32 wr_mid; u64 cntrl, *end; int qidx, credits; @@ -892,13 +1002,14 @@ netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev) struct cpl_tx_pkt_core *cpl; const struct skb_shared_info *ssi; dma_addr_t addr[MAX_SKB_FRAGS + 1]; + bool immediate = false; /* * The chip min packet length is 10 octets but play safe and reject * anything shorter than an Ethernet header. */ if (unlikely(skb->len < ETH_HLEN)) { -out_free: dev_kfree_skb(skb); +out_free: dev_kfree_skb_any(skb); return NETDEV_TX_OK; } @@ -921,7 +1032,10 @@ out_free: dev_kfree_skb(skb); return NETDEV_TX_BUSY; } - if (!is_eth_imm(skb) && + if (is_eth_imm(skb)) + immediate = true; + + if (!immediate && unlikely(map_skb(adap->pdev_dev, skb, addr) < 0)) { q->mapping_err++; goto out_free; @@ -938,6 +1052,7 @@ out_free: dev_kfree_skb(skb); wr->r3 = cpu_to_be64(0); end = (u64 *)wr + flits; + len = immediate ? skb->len : 0; ssi = skb_shinfo(skb); if (ssi->gso_size) { struct cpl_tx_pkt_lso *lso = (void *)wr; @@ -945,8 +1060,9 @@ out_free: dev_kfree_skb(skb); int l3hdr_len = skb_network_header_len(skb); int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN; + len += sizeof(*lso); wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | - FW_WR_IMMDLEN(sizeof(*lso))); + FW_WR_IMMDLEN(len)); lso->c.lso_ctrl = htonl(LSO_OPCODE(CPL_TX_PKT_LSO) | LSO_FIRST_SLICE | LSO_LAST_SLICE | LSO_IPV6(v6) | @@ -964,9 +1080,7 @@ out_free: dev_kfree_skb(skb); q->tso++; q->tx_cso += ssi->gso_segs; } else { - int len; - - len = is_eth_imm(skb) ? skb->len + sizeof(*cpl) : sizeof(*cpl); + len += sizeof(*cpl); wr->op_immdlen = htonl(FW_WR_OP(FW_ETH_TX_PKT_WR) | FW_WR_IMMDLEN(len)); cpl = (void *)(wr + 1); @@ -988,9 +1102,9 @@ out_free: dev_kfree_skb(skb); cpl->len = htons(skb->len); cpl->ctrl1 = cpu_to_be64(cntrl); - if (is_eth_imm(skb)) { + if (immediate) { inline_tx_skb(skb, &q->q, cpl + 1); - dev_kfree_skb(skb); + dev_consume_skb_any(skb); } else { int last_desc; @@ -1204,7 +1318,7 @@ static inline unsigned int calc_tx_flits_ofld(const struct sk_buff *skb) flits = skb_transport_offset(skb) / 8U; /* headers */ cnt = skb_shinfo(skb)->nr_frags; - if (skb->tail != skb->transport_header) + if (skb_tail_pointer(skb) != skb_transport_header(skb)) cnt++; return flits + sgl_len(cnt); } @@ -1377,8 +1491,12 @@ static inline int ofld_send(struct adapter *adap, struct sk_buff *skb) { unsigned int idx = skb_txq(skb); - if (unlikely(is_ctrl_pkt(skb))) + if (unlikely(is_ctrl_pkt(skb))) { + /* Single ctrl queue is a requirement for LE workaround path */ + if (adap->tids.nsftids) + idx = 0; return ctrl_xmit(&adap->sge.ctrlq[idx], skb); + } return ofld_xmit(&adap->sge.ofldtxq[idx], skb); } @@ -1499,7 +1617,6 @@ static noinline int handle_trace_pkt(struct adapter *adap, const struct pkt_gl *gl) { struct sk_buff *skb; - struct cpl_trace_pkt *p; skb = cxgb4_pktgl_to_skb(gl, RX_PULL_LEN, RX_PULL_LEN); if (unlikely(!skb)) { @@ -1507,8 +1624,11 @@ static noinline int handle_trace_pkt(struct adapter *adap, return 0; } - p = (struct cpl_trace_pkt *)skb->data; - __skb_pull(skb, sizeof(*p)); + if (is_t4(adap->params.chip)) + __skb_pull(skb, sizeof(struct cpl_trace_pkt)); + else + __skb_pull(skb, sizeof(struct cpl_t5_trace_pkt)); + skb_reset_mac_header(skb); skb->protocol = htons(0xffff); skb->dev = adap->port[0]; @@ -1519,6 +1639,8 @@ static noinline int handle_trace_pkt(struct adapter *adap, static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, const struct cpl_rx_pkt *pkt) { + struct adapter *adapter = rxq->rspq.adap; + struct sge *s = &adapter->sge; int ret; struct sk_buff *skb; @@ -1529,17 +1651,18 @@ static void do_gro(struct sge_eth_rxq *rxq, const struct pkt_gl *gl, return; } - copy_frags(skb, gl, RX_PKT_PAD); - skb->len = gl->tot_len - RX_PKT_PAD; + copy_frags(skb, gl, s->pktshift); + skb->len = gl->tot_len - s->pktshift; skb->data_len = skb->len; skb->truesize += skb->data_len; skb->ip_summed = CHECKSUM_UNNECESSARY; skb_record_rx_queue(skb, rxq->rspq.idx); if (rxq->rspq.netdev->features & NETIF_F_RXHASH) - skb->rxhash = (__force u32)pkt->rsshdr.hash_val; + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); if (unlikely(pkt->vlan_ex)) { - __vlan_hwaccel_put_tag(skb, ntohs(pkt->vlan)); + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); rxq->stats.vlan_ex++; } ret = napi_gro_frags(&rxq->rspq.napi); @@ -1566,12 +1689,16 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, struct sk_buff *skb; const struct cpl_rx_pkt *pkt; struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct sge *s = &q->adap->sge; + int cpl_trace_pkt = is_t4(q->adap->params.chip) ? + CPL_TRACE_PKT : CPL_TRACE_PKT_T5; - if (unlikely(*(u8 *)rsp == CPL_TRACE_PKT)) + if (unlikely(*(u8 *)rsp == cpl_trace_pkt)) return handle_trace_pkt(q->adap, si); pkt = (const struct cpl_rx_pkt *)rsp; - csum_ok = pkt->csum_calc && !pkt->err_vec; + csum_ok = pkt->csum_calc && !pkt->err_vec && + (q->netdev->features & NETIF_F_RXCSUM); if ((pkt->l2info & htonl(RXF_TCP)) && (q->netdev->features & NETIF_F_GRO) && csum_ok && !pkt->ip_frag) { do_gro(rxq, si, pkt); @@ -1585,16 +1712,16 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, return 0; } - __skb_pull(skb, RX_PKT_PAD); /* remove ethernet header padding */ + __skb_pull(skb, s->pktshift); /* remove ethernet header padding */ skb->protocol = eth_type_trans(skb, q->netdev); skb_record_rx_queue(skb, q->idx); if (skb->dev->features & NETIF_F_RXHASH) - skb->rxhash = (__force u32)pkt->rsshdr.hash_val; + skb_set_hash(skb, (__force u32)pkt->rsshdr.hash_val, + PKT_HASH_TYPE_L3); rxq->stats.pkts++; - if (csum_ok && (q->netdev->features & NETIF_F_RXCSUM) && - (pkt->l2info & htonl(RXF_UDP | RXF_TCP))) { + if (csum_ok && (pkt->l2info & htonl(RXF_UDP | RXF_TCP))) { if (!pkt->ip_frag) { skb->ip_summed = CHECKSUM_UNNECESSARY; rxq->stats.rx_cso++; @@ -1608,7 +1735,7 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp, skb_checksum_none_assert(skb); if (unlikely(pkt->vlan_ex)) { - __vlan_hwaccel_put_tag(skb, ntohs(pkt->vlan)); + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(pkt->vlan)); rxq->stats.vlan_ex++; } netif_receive_skb(skb); @@ -1696,6 +1823,8 @@ static int process_responses(struct sge_rspq *q, int budget) int budget_left = budget; const struct rsp_ctrl *rc; struct sge_eth_rxq *rxq = container_of(q, struct sge_eth_rxq, rspq); + struct adapter *adapter = q->adap; + struct sge *s = &adapter->sge; while (likely(budget_left)) { rc = (void *)q->cur_desc + (q->iqe_len - sizeof(*rc)); @@ -1722,7 +1851,7 @@ static int process_responses(struct sge_rspq *q, int budget) /* gather packet fragments */ for (frags = 0, fp = si.frags; ; frags++, fp++) { rsd = &rxq->fl.sdesc[rxq->fl.cidx]; - bufsz = get_buf_size(rsd); + bufsz = get_buf_size(adapter, rsd); fp->page = rsd->page; fp->offset = q->offset; fp->size = min(bufsz, len); @@ -1747,7 +1876,7 @@ static int process_responses(struct sge_rspq *q, int budget) si.nfrags = frags + 1; ret = q->handler(q, q->cur_desc, &si); if (likely(ret == 0)) - q->offset += ALIGN(fp->size, FL_ALIGN); + q->offset += ALIGN(fp->size, s->fl_align); else restore_rx_bufs(&si, &rxq->fl, frags); } else if (likely(rsp_type == RSP_TYPE_CPL)) { @@ -1891,7 +2020,7 @@ irq_handler_t t4_intr_handler(struct adapter *adap) static void sge_rx_timer_cb(unsigned long data) { unsigned long m; - unsigned int i, cnt[2]; + unsigned int i, idma_same_state_cnt[2]; struct adapter *adap = (struct adapter *)data; struct sge *s = &adap->sge; @@ -1902,7 +2031,7 @@ static void sge_rx_timer_cb(unsigned long data) struct sge_fl *fl = s->egr_map[id]; clear_bit(id, s->starving_fl); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); if (fl_starving(fl)) { rxq = container_of(fl, struct sge_eth_rxq, fl); @@ -1914,21 +2043,64 @@ static void sge_rx_timer_cb(unsigned long data) } t4_write_reg(adap, SGE_DEBUG_INDEX, 13); - cnt[0] = t4_read_reg(adap, SGE_DEBUG_DATA_HIGH); - cnt[1] = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); - - for (i = 0; i < 2; i++) - if (cnt[i] >= s->starve_thres) { - if (s->idma_state[i] || cnt[i] == 0xffffffff) - continue; - s->idma_state[i] = 1; - t4_write_reg(adap, SGE_DEBUG_INDEX, 11); - m = t4_read_reg(adap, SGE_DEBUG_DATA_LOW) >> (i * 16); - dev_warn(adap->pdev_dev, - "SGE idma%u starvation detected for " - "queue %lu\n", i, m & 0xffff); - } else if (s->idma_state[i]) - s->idma_state[i] = 0; + idma_same_state_cnt[0] = t4_read_reg(adap, SGE_DEBUG_DATA_HIGH); + idma_same_state_cnt[1] = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + + for (i = 0; i < 2; i++) { + u32 debug0, debug11; + + /* If the Ingress DMA Same State Counter ("timer") is less + * than 1s, then we can reset our synthesized Stall Timer and + * continue. If we have previously emitted warnings about a + * potential stalled Ingress Queue, issue a note indicating + * that the Ingress Queue has resumed forward progress. + */ + if (idma_same_state_cnt[i] < s->idma_1s_thresh) { + if (s->idma_stalled[i] >= SGE_IDMA_WARN_THRESH) + CH_WARN(adap, "SGE idma%d, queue%u,resumed after %d sec\n", + i, s->idma_qid[i], + s->idma_stalled[i]/HZ); + s->idma_stalled[i] = 0; + continue; + } + + /* Synthesize an SGE Ingress DMA Same State Timer in the Hz + * domain. The first time we get here it'll be because we + * passed the 1s Threshold; each additional time it'll be + * because the RX Timer Callback is being fired on its regular + * schedule. + * + * If the stall is below our Potential Hung Ingress Queue + * Warning Threshold, continue. + */ + if (s->idma_stalled[i] == 0) + s->idma_stalled[i] = HZ; + else + s->idma_stalled[i] += RX_QCHECK_PERIOD; + + if (s->idma_stalled[i] < SGE_IDMA_WARN_THRESH) + continue; + + /* We'll issue a warning every SGE_IDMA_WARN_REPEAT Hz */ + if (((s->idma_stalled[i] - HZ) % SGE_IDMA_WARN_REPEAT) != 0) + continue; + + /* Read and save the SGE IDMA State and Queue ID information. + * We do this every time in case it changes across time ... + */ + t4_write_reg(adap, SGE_DEBUG_INDEX, 0); + debug0 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + s->idma_state[i] = (debug0 >> (i * 9)) & 0x3f; + + t4_write_reg(adap, SGE_DEBUG_INDEX, 11); + debug11 = t4_read_reg(adap, SGE_DEBUG_DATA_LOW); + s->idma_qid[i] = (debug11 >> (i * 16)) & 0xffff; + + CH_WARN(adap, "SGE idma%u, queue%u, maybe stuck state%u %dsecs (debug0=%#x, debug11=%#x)\n", + i, s->idma_qid[i], s->idma_state[i], + s->idma_stalled[i]/HZ, debug0, debug11); + t4_sge_decode_idma_state(adap, s->idma_state[i]); + } mod_timer(&s->rx_timer, jiffies + RX_QCHECK_PERIOD); } @@ -1983,6 +2155,7 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, { int ret, flsz = 0; struct fw_iq_cmd c; + struct sge *s = &adap->sge; struct port_info *pi = netdev_priv(dev); /* Size needs to be multiple of 16, including status entry. */ @@ -2015,15 +2188,15 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, fl->size = roundup(fl->size, 8); fl->desc = alloc_ring(adap->pdev_dev, fl->size, sizeof(__be64), sizeof(struct rx_sw_desc), &fl->addr, - &fl->sdesc, STAT_LEN, NUMA_NO_NODE); + &fl->sdesc, s->stat_len, NUMA_NO_NODE); if (!fl->desc) goto fl_nomem; - flsz = fl->size / 8 + STAT_LEN / sizeof(struct tx_desc); - c.iqns_to_fl0congen = htonl(FW_IQ_CMD_FL0PACKEN | + flsz = fl->size / 8 + s->stat_len / sizeof(struct tx_desc); + c.iqns_to_fl0congen = htonl(FW_IQ_CMD_FL0PACKEN(1) | FW_IQ_CMD_FL0FETCHRO(1) | FW_IQ_CMD_FL0DATARO(1) | - FW_IQ_CMD_FL0PADEN); + FW_IQ_CMD_FL0PADEN(1)); c.fl0dcaen_to_fl0cidxfthresh = htons(FW_IQ_CMD_FL0FBMIN(2) | FW_IQ_CMD_FL0FBMAX(3)); c.fl0size = htons(flsz); @@ -2042,7 +2215,6 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq, iq->cntxt_id = ntohs(c.iqid); iq->abs_id = ntohs(c.physiqid); iq->size--; /* subtract status entry */ - iq->adap = adap; iq->netdev = dev; iq->handler = hnd; @@ -2081,11 +2253,27 @@ err: static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id) { + q->cntxt_id = id; + if (!is_t4(adap->params.chip)) { + unsigned int s_qpp; + unsigned short udb_density; + unsigned long qpshift; + int page; + + s_qpp = QUEUESPERPAGEPF1 * adap->fn; + udb_density = 1 << QUEUESPERPAGEPF0_GET((t4_read_reg(adap, + SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp)); + qpshift = PAGE_SHIFT - ilog2(udb_density); + q->udb = q->cntxt_id << qpshift; + q->udb &= PAGE_MASK; + page = q->udb / PAGE_SIZE; + q->udb += (q->cntxt_id - (page * udb_density)) * 128; + } + q->in_use = 0; q->cidx = q->pidx = 0; q->stops = q->restarts = 0; q->stat = (void *)&q->desc[q->size]; - q->cntxt_id = id; spin_lock_init(&q->db_lock); adap->sge.egr_map[id - adap->sge.egr_start] = q; } @@ -2096,14 +2284,15 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq, { int ret, nentries; struct fw_eq_eth_cmd c; + struct sge *s = &adap->sge; struct port_info *pi = netdev_priv(dev); /* Add status entries */ - nentries = txq->q.size + STAT_LEN / sizeof(struct tx_desc); + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, sizeof(struct tx_desc), sizeof(struct tx_sw_desc), - &txq->q.phys_addr, &txq->q.sdesc, STAT_LEN, + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, netdev_queue_numa_node_read(netdevq)); if (!txq->q.desc) return -ENOMEM; @@ -2149,10 +2338,11 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq, { int ret, nentries; struct fw_eq_ctrl_cmd c; + struct sge *s = &adap->sge; struct port_info *pi = netdev_priv(dev); /* Add status entries */ - nentries = txq->q.size + STAT_LEN / sizeof(struct tx_desc); + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); txq->q.desc = alloc_ring(adap->pdev_dev, nentries, sizeof(struct tx_desc), 0, &txq->q.phys_addr, @@ -2200,14 +2390,15 @@ int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq, { int ret, nentries; struct fw_eq_ofld_cmd c; + struct sge *s = &adap->sge; struct port_info *pi = netdev_priv(dev); /* Add status entries */ - nentries = txq->q.size + STAT_LEN / sizeof(struct tx_desc); + nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc); txq->q.desc = alloc_ring(adap->pdev_dev, txq->q.size, sizeof(struct tx_desc), sizeof(struct tx_sw_desc), - &txq->q.phys_addr, &txq->q.sdesc, STAT_LEN, + &txq->q.phys_addr, &txq->q.sdesc, s->stat_len, NUMA_NO_NODE); if (!txq->q.desc) return -ENOMEM; @@ -2251,8 +2442,10 @@ int t4_sge_alloc_ofld_txq(struct adapter *adap, struct sge_ofld_txq *txq, static void free_txq(struct adapter *adap, struct sge_txq *q) { + struct sge *s = &adap->sge; + dma_free_coherent(adap->pdev_dev, - q->size * sizeof(struct tx_desc) + STAT_LEN, + q->size * sizeof(struct tx_desc) + s->stat_len, q->desc, q->phys_addr); q->cntxt_id = 0; q->sdesc = NULL; @@ -2262,6 +2455,7 @@ static void free_txq(struct adapter *adap, struct sge_txq *q) static void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, struct sge_fl *fl) { + struct sge *s = &adap->sge; unsigned int fl_id = fl ? fl->cntxt_id : 0xffff; adap->sge.ingr_map[rq->cntxt_id - adap->sge.ingr_start] = NULL; @@ -2276,7 +2470,7 @@ static void free_rspq_fl(struct adapter *adap, struct sge_rspq *rq, if (fl) { free_rx_bufs(adap, fl, fl->avail); - dma_free_coherent(adap->pdev_dev, fl->size * 8 + STAT_LEN, + dma_free_coherent(adap->pdev_dev, fl->size * 8 + s->stat_len, fl->desc, fl->addr); kfree(fl->sdesc); fl->sdesc = NULL; @@ -2320,6 +2514,10 @@ void t4_free_sge_resources(struct adapter *adap) if (oq->rspq.desc) free_rspq_fl(adap, &oq->rspq, &oq->fl); } + for (i = 0, oq = adap->sge.rdmaciq; i < adap->sge.rdmaciqs; i++, oq++) { + if (oq->rspq.desc) + free_rspq_fl(adap, &oq->rspq, &oq->fl); + } /* clean up offload Tx queues */ for (i = 0; i < ARRAY_SIZE(adap->sge.ofldtxq); i++) { @@ -2408,38 +2606,160 @@ void t4_sge_stop(struct adapter *adap) * Performs SGE initialization needed every time after a chip reset. * We do not initialize any of the queues here, instead the driver * top-level must request them individually. + * + * Called in two different modes: + * + * 1. Perform actual hardware initialization and record hard-coded + * parameters which were used. This gets used when we're the + * Master PF and the Firmware Configuration File support didn't + * work for some reason. + * + * 2. We're not the Master PF or initialization was performed with + * a Firmware Configuration File. In this case we need to grab + * any of the SGE operating parameters that we need to have in + * order to do our job and make sure we can live with them ... */ -void t4_sge_init(struct adapter *adap) + +static int t4_sge_init_soft(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 fl_small_pg, fl_large_pg, fl_small_mtu, fl_large_mtu; + u32 timer_value_0_and_1, timer_value_2_and_3, timer_value_4_and_5; + u32 ingress_rx_threshold; + + /* + * Verify that CPL messages are going to the Ingress Queue for + * process_responses() and that only packet data is going to the + * Free Lists. + */ + if ((t4_read_reg(adap, SGE_CONTROL) & RXPKTCPLMODE_MASK) != + RXPKTCPLMODE(X_RXPKTCPLMODE_SPLIT)) { + dev_err(adap->pdev_dev, "bad SGE CPL MODE\n"); + return -EINVAL; + } + + /* + * Validate the Host Buffer Register Array indices that we want to + * use ... + * + * XXX Note that we should really read through the Host Buffer Size + * XXX register array and find the indices of the Buffer Sizes which + * XXX meet our needs! + */ + #define READ_FL_BUF(x) \ + t4_read_reg(adap, SGE_FL_BUFFER_SIZE0+(x)*sizeof(u32)) + + fl_small_pg = READ_FL_BUF(RX_SMALL_PG_BUF); + fl_large_pg = READ_FL_BUF(RX_LARGE_PG_BUF); + fl_small_mtu = READ_FL_BUF(RX_SMALL_MTU_BUF); + fl_large_mtu = READ_FL_BUF(RX_LARGE_MTU_BUF); + + /* We only bother using the Large Page logic if the Large Page Buffer + * is larger than our Page Size Buffer. + */ + if (fl_large_pg <= fl_small_pg) + fl_large_pg = 0; + + #undef READ_FL_BUF + + /* The Page Size Buffer must be exactly equal to our Page Size and the + * Large Page Size Buffer should be 0 (per above) or a power of 2. + */ + if (fl_small_pg != PAGE_SIZE || + (fl_large_pg & (fl_large_pg-1)) != 0) { + dev_err(adap->pdev_dev, "bad SGE FL page buffer sizes [%d, %d]\n", + fl_small_pg, fl_large_pg); + return -EINVAL; + } + if (fl_large_pg) + s->fl_pg_order = ilog2(fl_large_pg) - PAGE_SHIFT; + + if (fl_small_mtu < FL_MTU_SMALL_BUFSIZE(adap) || + fl_large_mtu < FL_MTU_LARGE_BUFSIZE(adap)) { + dev_err(adap->pdev_dev, "bad SGE FL MTU sizes [%d, %d]\n", + fl_small_mtu, fl_large_mtu); + return -EINVAL; + } + + /* + * Retrieve our RX interrupt holdoff timer values and counter + * threshold values from the SGE parameters. + */ + timer_value_0_and_1 = t4_read_reg(adap, SGE_TIMER_VALUE_0_AND_1); + timer_value_2_and_3 = t4_read_reg(adap, SGE_TIMER_VALUE_2_AND_3); + timer_value_4_and_5 = t4_read_reg(adap, SGE_TIMER_VALUE_4_AND_5); + s->timer_val[0] = core_ticks_to_us(adap, + TIMERVALUE0_GET(timer_value_0_and_1)); + s->timer_val[1] = core_ticks_to_us(adap, + TIMERVALUE1_GET(timer_value_0_and_1)); + s->timer_val[2] = core_ticks_to_us(adap, + TIMERVALUE2_GET(timer_value_2_and_3)); + s->timer_val[3] = core_ticks_to_us(adap, + TIMERVALUE3_GET(timer_value_2_and_3)); + s->timer_val[4] = core_ticks_to_us(adap, + TIMERVALUE4_GET(timer_value_4_and_5)); + s->timer_val[5] = core_ticks_to_us(adap, + TIMERVALUE5_GET(timer_value_4_and_5)); + + ingress_rx_threshold = t4_read_reg(adap, SGE_INGRESS_RX_THRESHOLD); + s->counter_val[0] = THRESHOLD_0_GET(ingress_rx_threshold); + s->counter_val[1] = THRESHOLD_1_GET(ingress_rx_threshold); + s->counter_val[2] = THRESHOLD_2_GET(ingress_rx_threshold); + s->counter_val[3] = THRESHOLD_3_GET(ingress_rx_threshold); + + return 0; +} + +static int t4_sge_init_hard(struct adapter *adap) { - unsigned int i, v; struct sge *s = &adap->sge; - unsigned int fl_align_log = ilog2(FL_ALIGN); - t4_set_reg_field(adap, SGE_CONTROL, PKTSHIFT_MASK | - INGPADBOUNDARY_MASK | EGRSTATUSPAGESIZE, - INGPADBOUNDARY(fl_align_log - 5) | PKTSHIFT(2) | - RXPKTCPLMODE | - (STAT_LEN == 128 ? EGRSTATUSPAGESIZE : 0)); + /* + * Set up our basic SGE mode to deliver CPL messages to our Ingress + * Queue and Packet Date to the Free List. + */ + t4_set_reg_field(adap, SGE_CONTROL, RXPKTCPLMODE_MASK, + RXPKTCPLMODE_MASK); /* * Set up to drop DOORBELL writes when the DOORBELL FIFO overflows * and generate an interrupt when this occurs so we can recover. */ - t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS, - V_HP_INT_THRESH(M_HP_INT_THRESH) | - V_LP_INT_THRESH(M_LP_INT_THRESH), - V_HP_INT_THRESH(dbfifo_int_thresh) | - V_LP_INT_THRESH(dbfifo_int_thresh)); + if (is_t4(adap->params.chip)) { + t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS, + V_HP_INT_THRESH(M_HP_INT_THRESH) | + V_LP_INT_THRESH(M_LP_INT_THRESH), + V_HP_INT_THRESH(dbfifo_int_thresh) | + V_LP_INT_THRESH(dbfifo_int_thresh)); + } else { + t4_set_reg_field(adap, A_SGE_DBFIFO_STATUS, + V_LP_INT_THRESH_T5(M_LP_INT_THRESH_T5), + V_LP_INT_THRESH_T5(dbfifo_int_thresh)); + t4_set_reg_field(adap, SGE_DBFIFO_STATUS2, + V_HP_INT_THRESH_T5(M_HP_INT_THRESH_T5), + V_HP_INT_THRESH_T5(dbfifo_int_thresh)); + } t4_set_reg_field(adap, A_SGE_DOORBELL_CONTROL, F_ENABLE_DROP, F_ENABLE_DROP); - for (i = v = 0; i < 32; i += 4) - v |= (PAGE_SHIFT - 10) << i; - t4_write_reg(adap, SGE_HOST_PAGE_SIZE, v); - t4_write_reg(adap, SGE_FL_BUFFER_SIZE0, PAGE_SIZE); -#if FL_PG_ORDER > 0 - t4_write_reg(adap, SGE_FL_BUFFER_SIZE1, PAGE_SIZE << FL_PG_ORDER); -#endif + /* + * SGE_FL_BUFFER_SIZE0 (RX_SMALL_PG_BUF) is set up by + * t4_fixup_host_params(). + */ + s->fl_pg_order = FL_PG_ORDER; + if (s->fl_pg_order) + t4_write_reg(adap, + SGE_FL_BUFFER_SIZE0+RX_LARGE_PG_BUF*sizeof(u32), + PAGE_SIZE << FL_PG_ORDER); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_SMALL_MTU_BUF*sizeof(u32), + FL_MTU_SMALL_BUFSIZE(adap)); + t4_write_reg(adap, SGE_FL_BUFFER_SIZE0+RX_LARGE_MTU_BUF*sizeof(u32), + FL_MTU_LARGE_BUFSIZE(adap)); + + /* + * Note that the SGE Ingress Packet Count Interrupt Threshold and + * Timer Holdoff values must be supplied by our caller. + */ t4_write_reg(adap, SGE_INGRESS_RX_THRESHOLD, THRESHOLD_0(s->counter_val[0]) | THRESHOLD_1(s->counter_val[1]) | @@ -2449,14 +2769,63 @@ void t4_sge_init(struct adapter *adap) TIMERVALUE0(us_to_core_ticks(adap, s->timer_val[0])) | TIMERVALUE1(us_to_core_ticks(adap, s->timer_val[1]))); t4_write_reg(adap, SGE_TIMER_VALUE_2_AND_3, - TIMERVALUE0(us_to_core_ticks(adap, s->timer_val[2])) | - TIMERVALUE1(us_to_core_ticks(adap, s->timer_val[3]))); + TIMERVALUE2(us_to_core_ticks(adap, s->timer_val[2])) | + TIMERVALUE3(us_to_core_ticks(adap, s->timer_val[3]))); t4_write_reg(adap, SGE_TIMER_VALUE_4_AND_5, - TIMERVALUE0(us_to_core_ticks(adap, s->timer_val[4])) | - TIMERVALUE1(us_to_core_ticks(adap, s->timer_val[5]))); + TIMERVALUE4(us_to_core_ticks(adap, s->timer_val[4])) | + TIMERVALUE5(us_to_core_ticks(adap, s->timer_val[5]))); + + return 0; +} + +int t4_sge_init(struct adapter *adap) +{ + struct sge *s = &adap->sge; + u32 sge_control, sge_conm_ctrl; + int ret, egress_threshold; + + /* + * Ingress Padding Boundary and Egress Status Page Size are set up by + * t4_fixup_host_params(). + */ + sge_control = t4_read_reg(adap, SGE_CONTROL); + s->pktshift = PKTSHIFT_GET(sge_control); + s->stat_len = (sge_control & EGRSTATUSPAGESIZE_MASK) ? 128 : 64; + s->fl_align = 1 << (INGPADBOUNDARY_GET(sge_control) + + X_INGPADBOUNDARY_SHIFT); + + if (adap->flags & USING_SOFT_PARAMS) + ret = t4_sge_init_soft(adap); + else + ret = t4_sge_init_hard(adap); + if (ret < 0) + return ret; + + /* + * A FL with <= fl_starve_thres buffers is starving and a periodic + * timer will attempt to refill it. This needs to be larger than the + * SGE's Egress Congestion Threshold. If it isn't, then we can get + * stuck waiting for new packets while the SGE is waiting for us to + * give it more Free List entries. (Note that the SGE's Egress + * Congestion Threshold is in units of 2 Free List pointers.) For T4, + * there was only a single field to control this. For T5 there's the + * original field which now only applies to Unpacked Mode Free List + * buffers and a new field which only applies to Packed Mode Free List + * buffers. + */ + sge_conm_ctrl = t4_read_reg(adap, SGE_CONM_CTRL); + if (is_t4(adap->params.chip)) + egress_threshold = EGRTHRESHOLD_GET(sge_conm_ctrl); + else + egress_threshold = EGRTHRESHOLDPACKING_GET(sge_conm_ctrl); + s->fl_starve_thres = 2*egress_threshold + 1; + setup_timer(&s->rx_timer, sge_rx_timer_cb, (unsigned long)adap); setup_timer(&s->tx_timer, sge_tx_timer_cb, (unsigned long)adap); - s->starve_thres = core_ticks_per_usec(adap) * 1000000; /* 1 s */ - s->idma_state[0] = s->idma_state[1] = 0; + s->idma_1s_thresh = core_ticks_per_usec(adap) * 1000000; /* 1 s */ + s->idma_stalled[0] = 0; + s->idma_stalled[1] = 0; spin_lock_init(&s->intrq_lock); + + return 0; } |
