diff options
Diffstat (limited to 'drivers/usb/musb/cppi_dma.c')
-rw-r--r-- | drivers/usb/musb/cppi_dma.c | 1540 |
1 files changed, 1540 insertions, 0 deletions
diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c new file mode 100644 index 00000000000..5ad6d0893cb --- /dev/null +++ b/drivers/usb/musb/cppi_dma.c @@ -0,0 +1,1540 @@ +/* + * Copyright (C) 2005-2006 by Texas Instruments + * + * This file implements a DMA interface using TI's CPPI DMA. + * For now it's DaVinci-only, but CPPI isn't specific to DaVinci or USB. + * The TUSB6020, using VLYNQ, has CPPI that looks much like DaVinci. + */ + +#include <linux/usb.h> + +#include "musb_core.h" +#include "cppi_dma.h" + + +/* CPPI DMA status 7-mar-2006: + * + * - See musb_{host,gadget}.c for more info + * + * - Correct RX DMA generally forces the engine into irq-per-packet mode, + * which can easily saturate the CPU under non-mass-storage loads. + * + * NOTES 24-aug-2006 (2.6.18-rc4): + * + * - peripheral RXDMA wedged in a test with packets of length 512/512/1. + * evidently after the 1 byte packet was received and acked, the queue + * of BDs got garbaged so it wouldn't empty the fifo. (rxcsr 0x2003, + * and RX DMA0: 4 left, 80000000 8feff880, 8feff860 8feff860; 8f321401 + * 004001ff 00000001 .. 8feff860) Host was just getting NAKed on tx + * of its next (512 byte) packet. IRQ issues? + * + * REVISIT: the "transfer DMA" glue between CPPI and USB fifos will + * evidently also directly update the RX and TX CSRs ... so audit all + * host and peripheral side DMA code to avoid CSR access after DMA has + * been started. + */ + +/* REVISIT now we can avoid preallocating these descriptors; or + * more simply, switch to a global freelist not per-channel ones. + * Note: at full speed, 64 descriptors == 4K bulk data. + */ +#define NUM_TXCHAN_BD 64 +#define NUM_RXCHAN_BD 64 + +static inline void cpu_drain_writebuffer(void) +{ + wmb(); +#ifdef CONFIG_CPU_ARM926T + /* REVISIT this "should not be needed", + * but lack of it sure seemed to hurt ... + */ + asm("mcr p15, 0, r0, c7, c10, 4 @ drain write buffer\n"); +#endif +} + +static inline struct cppi_descriptor *cppi_bd_alloc(struct cppi_channel *c) +{ + struct cppi_descriptor *bd = c->freelist; + + if (bd) + c->freelist = bd->next; + return bd; +} + +static inline void +cppi_bd_free(struct cppi_channel *c, struct cppi_descriptor *bd) +{ + if (!bd) + return; + bd->next = c->freelist; + c->freelist = bd; +} + +/* + * Start DMA controller + * + * Initialize the DMA controller as necessary. + */ + +/* zero out entire rx state RAM entry for the channel */ +static void cppi_reset_rx(struct cppi_rx_stateram __iomem *rx) +{ + musb_writel(&rx->rx_skipbytes, 0, 0); + musb_writel(&rx->rx_head, 0, 0); + musb_writel(&rx->rx_sop, 0, 0); + musb_writel(&rx->rx_current, 0, 0); + musb_writel(&rx->rx_buf_current, 0, 0); + musb_writel(&rx->rx_len_len, 0, 0); + musb_writel(&rx->rx_cnt_cnt, 0, 0); +} + +/* zero out entire tx state RAM entry for the channel */ +static void cppi_reset_tx(struct cppi_tx_stateram __iomem *tx, u32 ptr) +{ + musb_writel(&tx->tx_head, 0, 0); + musb_writel(&tx->tx_buf, 0, 0); + musb_writel(&tx->tx_current, 0, 0); + musb_writel(&tx->tx_buf_current, 0, 0); + musb_writel(&tx->tx_info, 0, 0); + musb_writel(&tx->tx_rem_len, 0, 0); + /* musb_writel(&tx->tx_dummy, 0, 0); */ + musb_writel(&tx->tx_complete, 0, ptr); +} + +static void __init cppi_pool_init(struct cppi *cppi, struct cppi_channel *c) +{ + int j; + + /* initialize channel fields */ + c->head = NULL; + c->tail = NULL; + c->last_processed = NULL; + c->channel.status = MUSB_DMA_STATUS_UNKNOWN; + c->controller = cppi; + c->is_rndis = 0; + c->freelist = NULL; + + /* build the BD Free list for the channel */ + for (j = 0; j < NUM_TXCHAN_BD + 1; j++) { + struct cppi_descriptor *bd; + dma_addr_t dma; + + bd = dma_pool_alloc(cppi->pool, GFP_KERNEL, &dma); + bd->dma = dma; + cppi_bd_free(c, bd); + } +} + +static int cppi_channel_abort(struct dma_channel *); + +static void cppi_pool_free(struct cppi_channel *c) +{ + struct cppi *cppi = c->controller; + struct cppi_descriptor *bd; + + (void) cppi_channel_abort(&c->channel); + c->channel.status = MUSB_DMA_STATUS_UNKNOWN; + c->controller = NULL; + + /* free all its bds */ + bd = c->last_processed; + do { + if (bd) + dma_pool_free(cppi->pool, bd, bd->dma); + bd = cppi_bd_alloc(c); + } while (bd); + c->last_processed = NULL; +} + +static int __init cppi_controller_start(struct dma_controller *c) +{ + struct cppi *controller; + void __iomem *tibase; + int i; + + controller = container_of(c, struct cppi, controller); + + /* do whatever is necessary to start controller */ + for (i = 0; i < ARRAY_SIZE(controller->tx); i++) { + controller->tx[i].transmit = true; + controller->tx[i].index = i; + } + for (i = 0; i < ARRAY_SIZE(controller->rx); i++) { + controller->rx[i].transmit = false; + controller->rx[i].index = i; + } + + /* setup BD list on a per channel basis */ + for (i = 0; i < ARRAY_SIZE(controller->tx); i++) + cppi_pool_init(controller, controller->tx + i); + for (i = 0; i < ARRAY_SIZE(controller->rx); i++) + cppi_pool_init(controller, controller->rx + i); + + tibase = controller->tibase; + INIT_LIST_HEAD(&controller->tx_complete); + + /* initialise tx/rx channel head pointers to zero */ + for (i = 0; i < ARRAY_SIZE(controller->tx); i++) { + struct cppi_channel *tx_ch = controller->tx + i; + struct cppi_tx_stateram __iomem *tx; + + INIT_LIST_HEAD(&tx_ch->tx_complete); + + tx = tibase + DAVINCI_TXCPPI_STATERAM_OFFSET(i); + tx_ch->state_ram = tx; + cppi_reset_tx(tx, 0); + } + for (i = 0; i < ARRAY_SIZE(controller->rx); i++) { + struct cppi_channel *rx_ch = controller->rx + i; + struct cppi_rx_stateram __iomem *rx; + + INIT_LIST_HEAD(&rx_ch->tx_complete); + + rx = tibase + DAVINCI_RXCPPI_STATERAM_OFFSET(i); + rx_ch->state_ram = rx; + cppi_reset_rx(rx); + } + + /* enable individual cppi channels */ + musb_writel(tibase, DAVINCI_TXCPPI_INTENAB_REG, + DAVINCI_DMA_ALL_CHANNELS_ENABLE); + musb_writel(tibase, DAVINCI_RXCPPI_INTENAB_REG, + DAVINCI_DMA_ALL_CHANNELS_ENABLE); + + /* enable tx/rx CPPI control */ + musb_writel(tibase, DAVINCI_TXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_ENABLE); + musb_writel(tibase, DAVINCI_RXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_ENABLE); + + /* disable RNDIS mode, also host rx RNDIS autorequest */ + musb_writel(tibase, DAVINCI_RNDIS_REG, 0); + musb_writel(tibase, DAVINCI_AUTOREQ_REG, 0); + + return 0; +} + +/* + * Stop DMA controller + * + * De-Init the DMA controller as necessary. + */ + +static int cppi_controller_stop(struct dma_controller *c) +{ + struct cppi *controller; + void __iomem *tibase; + int i; + + controller = container_of(c, struct cppi, controller); + + tibase = controller->tibase; + /* DISABLE INDIVIDUAL CHANNEL Interrupts */ + musb_writel(tibase, DAVINCI_TXCPPI_INTCLR_REG, + DAVINCI_DMA_ALL_CHANNELS_ENABLE); + musb_writel(tibase, DAVINCI_RXCPPI_INTCLR_REG, + DAVINCI_DMA_ALL_CHANNELS_ENABLE); + + DBG(1, "Tearing down RX and TX Channels\n"); + for (i = 0; i < ARRAY_SIZE(controller->tx); i++) { + /* FIXME restructure of txdma to use bds like rxdma */ + controller->tx[i].last_processed = NULL; + cppi_pool_free(controller->tx + i); + } + for (i = 0; i < ARRAY_SIZE(controller->rx); i++) + cppi_pool_free(controller->rx + i); + + /* in Tx Case proper teardown is supported. We resort to disabling + * Tx/Rx CPPI after cleanup of Tx channels. Before TX teardown is + * complete TX CPPI cannot be disabled. + */ + /*disable tx/rx cppi */ + musb_writel(tibase, DAVINCI_TXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_DISABLE); + musb_writel(tibase, DAVINCI_RXCPPI_CTRL_REG, DAVINCI_DMA_CTRL_DISABLE); + + return 0; +} + +/* While dma channel is allocated, we only want the core irqs active + * for fault reports, otherwise we'd get irqs that we don't care about. + * Except for TX irqs, where dma done != fifo empty and reusable ... + * + * NOTE: docs don't say either way, but irq masking **enables** irqs. + * + * REVISIT same issue applies to pure PIO usage too, and non-cppi dma... + */ +static inline void core_rxirq_disable(void __iomem *tibase, unsigned epnum) +{ + musb_writel(tibase, DAVINCI_USB_INT_MASK_CLR_REG, 1 << (epnum + 8)); +} + +static inline void core_rxirq_enable(void __iomem *tibase, unsigned epnum) +{ + musb_writel(tibase, DAVINCI_USB_INT_MASK_SET_REG, 1 << (epnum + 8)); +} + + +/* + * Allocate a CPPI Channel for DMA. With CPPI, channels are bound to + * each transfer direction of a non-control endpoint, so allocating + * (and deallocating) is mostly a way to notice bad housekeeping on + * the software side. We assume the irqs are always active. + */ +static struct dma_channel * +cppi_channel_allocate(struct dma_controller *c, + struct musb_hw_ep *ep, u8 transmit) +{ + struct cppi *controller; + u8 index; + struct cppi_channel *cppi_ch; + void __iomem *tibase; + + controller = container_of(c, struct cppi, controller); + tibase = controller->tibase; + + /* ep0 doesn't use DMA; remember cppi indices are 0..N-1 */ + index = ep->epnum - 1; + + /* return the corresponding CPPI Channel Handle, and + * probably disable the non-CPPI irq until we need it. + */ + if (transmit) { + if (index >= ARRAY_SIZE(controller->tx)) { + DBG(1, "no %cX%d CPPI channel\n", 'T', index); + return NULL; + } + cppi_ch = controller->tx + index; + } else { + if (index >= ARRAY_SIZE(controller->rx)) { + DBG(1, "no %cX%d CPPI channel\n", 'R', index); + return NULL; + } + cppi_ch = controller->rx + index; + core_rxirq_disable(tibase, ep->epnum); + } + + /* REVISIT make this an error later once the same driver code works + * with the other DMA engine too + */ + if (cppi_ch->hw_ep) + DBG(1, "re-allocating DMA%d %cX channel %p\n", + index, transmit ? 'T' : 'R', cppi_ch); + cppi_ch->hw_ep = ep; + cppi_ch->channel.status = MUSB_DMA_STATUS_FREE; + + DBG(4, "Allocate CPPI%d %cX\n", index, transmit ? 'T' : 'R'); + return &cppi_ch->channel; +} + +/* Release a CPPI Channel. */ +static void cppi_channel_release(struct dma_channel *channel) +{ + struct cppi_channel *c; + void __iomem *tibase; + + /* REVISIT: for paranoia, check state and abort if needed... */ + + c = container_of(channel, struct cppi_channel, channel); + tibase = c->controller->tibase; + if (!c->hw_ep) + DBG(1, "releasing idle DMA channel %p\n", c); + else if (!c->transmit) + core_rxirq_enable(tibase, c->index + 1); + + /* for now, leave its cppi IRQ enabled (we won't trigger it) */ + c->hw_ep = NULL; + channel->status = MUSB_DMA_STATUS_UNKNOWN; +} + +/* Context: controller irqlocked */ +static void +cppi_dump_rx(int level, struct cppi_channel *c, const char *tag) +{ + void __iomem *base = c->controller->mregs; + struct cppi_rx_stateram __iomem *rx = c->state_ram; + + musb_ep_select(base, c->index + 1); + + DBG(level, "RX DMA%d%s: %d left, csr %04x, " + "%08x H%08x S%08x C%08x, " + "B%08x L%08x %08x .. %08x" + "\n", + c->index, tag, + musb_readl(c->controller->tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + 4 * c->index), + musb_readw(c->hw_ep->regs, MUSB_RXCSR), + + musb_readl(&rx->rx_skipbytes, 0), + musb_readl(&rx->rx_head, 0), + musb_readl(&rx->rx_sop, 0), + musb_readl(&rx->rx_current, 0), + + musb_readl(&rx->rx_buf_current, 0), + musb_readl(&rx->rx_len_len, 0), + musb_readl(&rx->rx_cnt_cnt, 0), + musb_readl(&rx->rx_complete, 0) + ); +} + +/* Context: controller irqlocked */ +static void +cppi_dump_tx(int level, struct cppi_channel *c, const char *tag) +{ + void __iomem *base = c->controller->mregs; + struct cppi_tx_stateram __iomem *tx = c->state_ram; + + musb_ep_select(base, c->index + 1); + + DBG(level, "TX DMA%d%s: csr %04x, " + "H%08x S%08x C%08x %08x, " + "F%08x L%08x .. %08x" + "\n", + c->index, tag, + musb_readw(c->hw_ep->regs, MUSB_TXCSR), + + musb_readl(&tx->tx_head, 0), + musb_readl(&tx->tx_buf, 0), + musb_readl(&tx->tx_current, 0), + musb_readl(&tx->tx_buf_current, 0), + + musb_readl(&tx->tx_info, 0), + musb_readl(&tx->tx_rem_len, 0), + /* dummy/unused word 6 */ + musb_readl(&tx->tx_complete, 0) + ); +} + +/* Context: controller irqlocked */ +static inline void +cppi_rndis_update(struct cppi_channel *c, int is_rx, + void __iomem *tibase, int is_rndis) +{ + /* we may need to change the rndis flag for this cppi channel */ + if (c->is_rndis != is_rndis) { + u32 value = musb_readl(tibase, DAVINCI_RNDIS_REG); + u32 temp = 1 << (c->index); + + if (is_rx) + temp <<= 16; + if (is_rndis) + value |= temp; + else + value &= ~temp; + musb_writel(tibase, DAVINCI_RNDIS_REG, value); + c->is_rndis = is_rndis; + } +} + +static void cppi_dump_rxbd(const char *tag, struct cppi_descriptor *bd) +{ + pr_debug("RXBD/%s %08x: " + "nxt %08x buf %08x off.blen %08x opt.plen %08x\n", + tag, bd->dma, + bd->hw_next, bd->hw_bufp, bd->hw_off_len, + bd->hw_options); +} + +static void cppi_dump_rxq(int level, const char *tag, struct cppi_channel *rx) +{ +#if MUSB_DEBUG > 0 + struct cppi_descriptor *bd; + + if (!_dbg_level(level)) + return; + cppi_dump_rx(level, rx, tag); + if (rx->last_processed) + cppi_dump_rxbd("last", rx->last_processed); + for (bd = rx->head; bd; bd = bd->next) + cppi_dump_rxbd("active", bd); +#endif +} + + +/* NOTE: DaVinci autoreq is ignored except for host side "RNDIS" mode RX; + * so we won't ever use it (see "CPPI RX Woes" below). + */ +static inline int cppi_autoreq_update(struct cppi_channel *rx, + void __iomem *tibase, int onepacket, unsigned n_bds) +{ + u32 val; + +#ifdef RNDIS_RX_IS_USABLE + u32 tmp; + /* assert(is_host_active(musb)) */ + + /* start from "AutoReq never" */ + tmp = musb_readl(tibase, DAVINCI_AUTOREQ_REG); + val = tmp & ~((0x3) << (rx->index * 2)); + + /* HCD arranged reqpkt for packet #1. we arrange int + * for all but the last one, maybe in two segments. + */ + if (!onepacket) { +#if 0 + /* use two segments, autoreq "all" then the last "never" */ + val |= ((0x3) << (rx->index * 2)); + n_bds--; +#else + /* one segment, autoreq "all-but-last" */ + val |= ((0x1) << (rx->index * 2)); +#endif + } + + if (val != tmp) { + int n = 100; + + /* make sure that autoreq is updated before continuing */ + musb_writel(tibase, DAVINCI_AUTOREQ_REG, val); + do { + tmp = musb_readl(tibase, DAVINCI_AUTOREQ_REG); + if (tmp == val) + break; + cpu_relax(); + } while (n-- > 0); + } +#endif + + /* REQPKT is turned off after each segment */ + if (n_bds && rx->channel.actual_len) { + void __iomem *regs = rx->hw_ep->regs; + + val = musb_readw(regs, MUSB_RXCSR); + if (!(val & MUSB_RXCSR_H_REQPKT)) { + val |= MUSB_RXCSR_H_REQPKT | MUSB_RXCSR_H_WZC_BITS; + musb_writew(regs, MUSB_RXCSR, val); + /* flush writebufer */ + val = musb_readw(regs, MUSB_RXCSR); + } + } + return n_bds; +} + + +/* Buffer enqueuing Logic: + * + * - RX builds new queues each time, to help handle routine "early + * termination" cases (faults, including errors and short reads) + * more correctly. + * + * - for now, TX reuses the same queue of BDs every time + * + * REVISIT long term, we want a normal dynamic model. + * ... the goal will be to append to the + * existing queue, processing completed "dma buffers" (segments) on the fly. + * + * Otherwise we force an IRQ latency between requests, which slows us a lot + * (especially in "transparent" dma). Unfortunately that model seems to be + * inherent in the DMA model from the Mentor code, except in the rare case + * of transfers big enough (~128+ KB) that we could append "middle" segments + * in the TX paths. (RX can't do this, see below.) + * + * That's true even in the CPPI- friendly iso case, where most urbs have + * several small segments provided in a group and where the "packet at a time" + * "transparent" DMA model is always correct, even on the RX side. + */ + +/* + * CPPI TX: + * ======== + * TX is a lot more reasonable than RX; it doesn't need to run in + * irq-per-packet mode very often. RNDIS mode seems to behave too + * (except how it handles the exactly-N-packets case). Building a + * txdma queue with multiple requests (urb or usb_request) looks + * like it would work ... but fault handling would need much testing. + * + * The main issue with TX mode RNDIS relates to transfer lengths that + * are an exact multiple of the packet length. It appears that there's + * a hiccup in that case (maybe the DMA completes before the ZLP gets + * written?) boiling down to not being able to rely on CPPI writing any + * terminating zero length packet before the next transfer is written. + * So that's punted to PIO; better yet, gadget drivers can avoid it. + * + * Plus, there's allegedly an undocumented constraint that rndis transfer + * length be a multiple of 64 bytes ... but the chip doesn't act that + * way, and we really don't _want_ that behavior anyway. + * + * On TX, "transparent" mode works ... although experiments have shown + * problems trying to use the SOP/EOP bits in different USB packets. + * + * REVISIT try to handle terminating zero length packets using CPPI + * instead of doing it by PIO after an IRQ. (Meanwhile, make Ethernet + * links avoid that issue by forcing them to avoid zlps.) + */ +static void +cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx) +{ + unsigned maxpacket = tx->maxpacket; + dma_addr_t addr = tx->buf_dma + tx->offset; + size_t length = tx->buf_len - tx->offset; + struct cppi_descriptor *bd; + unsigned n_bds; + unsigned i; + struct cppi_tx_stateram __iomem *tx_ram = tx->state_ram; + int rndis; + + /* TX can use the CPPI "rndis" mode, where we can probably fit this + * transfer in one BD and one IRQ. The only time we would NOT want + * to use it is when hardware constraints prevent it, or if we'd + * trigger the "send a ZLP?" confusion. + */ + rndis = (maxpacket & 0x3f) == 0 + && length < 0xffff + && (length % maxpacket) != 0; + + if (rndis) { + maxpacket = length; + n_bds = 1; + } else { + n_bds = length / maxpacket; + if (!length || (length % maxpacket)) + n_bds++; + n_bds = min(n_bds, (unsigned) NUM_TXCHAN_BD); + length = min(n_bds * maxpacket, length); + } + + DBG(4, "TX DMA%d, pktSz %d %s bds %d dma 0x%x len %u\n", + tx->index, + maxpacket, + rndis ? "rndis" : "transparent", + n_bds, + addr, length); + + cppi_rndis_update(tx, 0, musb->ctrl_base, rndis); + + /* assuming here that channel_program is called during + * transfer initiation ... current code maintains state + * for one outstanding request only (no queues, not even + * the implicit ones of an iso urb). + */ + + bd = tx->freelist; + tx->head = bd; + tx->last_processed = NULL; + + /* FIXME use BD pool like RX side does, and just queue + * the minimum number for this request. + */ + + /* Prepare queue of BDs first, then hand it to hardware. + * All BDs except maybe the last should be of full packet + * size; for RNDIS there _is_ only that last packet. + */ + for (i = 0; i < n_bds; ) { + if (++i < n_bds && bd->next) + bd->hw_next = bd->next->dma; + else + bd->hw_next = 0; + + bd->hw_bufp = tx->buf_dma + tx->offset; + + /* FIXME set EOP only on the last packet, + * SOP only on the first ... avoid IRQs + */ + if ((tx->offset + maxpacket) <= tx->buf_len) { + tx->offset += maxpacket; + bd->hw_off_len = maxpacket; + bd->hw_options = CPPI_SOP_SET | CPPI_EOP_SET + | CPPI_OWN_SET | maxpacket; + } else { + /* only this one may be a partial USB Packet */ + u32 partial_len; + + partial_len = tx->buf_len - tx->offset; + tx->offset = tx->buf_len; + bd->hw_off_len = partial_len; + + bd->hw_options = CPPI_SOP_SET | CPPI_EOP_SET + | CPPI_OWN_SET | partial_len; + if (partial_len == 0) + bd->hw_options |= CPPI_ZERO_SET; + } + + DBG(5, "TXBD %p: nxt %08x buf %08x len %04x opt %08x\n", + bd, bd->hw_next, bd->hw_bufp, + bd->hw_off_len, bd->hw_options); + + /* update the last BD enqueued to the list */ + tx->tail = bd; + bd = bd->next; + } + + /* BDs live in DMA-coherent memory, but writes might be pending */ + cpu_drain_writebuffer(); + + /* Write to the HeadPtr in state RAM to trigger */ + musb_writel(&tx_ram->tx_head, 0, (u32)tx->freelist->dma); + + cppi_dump_tx(5, tx, "/S"); +} + +/* + * CPPI RX Woes: + * ============= + * Consider a 1KB bulk RX buffer in two scenarios: (a) it's fed two 300 byte + * packets back-to-back, and (b) it's fed two 512 byte packets back-to-back. + * (Full speed transfers have similar scenarios.) + * + * The correct behavior for Linux is that (a) fills the buffer with 300 bytes, + * and the next packet goes into a buffer that's queued later; while (b) fills + * the buffer with 1024 bytes. How to do that with CPPI? + * + * - RX queues in "rndis" mode -- one single BD -- handle (a) correctly, but + * (b) loses **BADLY** because nothing (!) happens when that second packet + * fills the buffer, much less when a third one arrives. (Which makes this + * not a "true" RNDIS mode. In the RNDIS protocol short-packet termination + * is optional, and it's fine if peripherals -- not hosts! -- pad messages + * out to end-of-buffer. Standard PCI host controller DMA descriptors + * implement that mode by default ... which is no accident.) + * + * - RX queues in "transparent" mode -- two BDs with 512 bytes each -- have + * converse problems: (b) is handled right, but (a) loses badly. CPPI RX + * ignores SOP/EOP markings and processes both of those BDs; so both packets + * are loaded into the buffer (with a 212 byte gap between them), and the next + * buffer queued will NOT get its 300 bytes of data. (It seems like SOP/EOP + * are intended as outputs for RX queues, not inputs...) + * + * - A variant of "transparent" mode -- one BD at a time -- is the only way to + * reliably make both cases work, with software handling both cases correctly + * and at the significant penalty of needing an IRQ per packet. (The lack of + * I/O overlap can be slightly ameliorated by enabling double buffering.) + * + * So how to get rid of IRQ-per-packet? The transparent multi-BD case could + * be used in special cases like mass storage, which sets URB_SHORT_NOT_OK + * (or maybe its peripheral side counterpart) to flag (a) scenarios as errors + * with guaranteed driver level fault recovery and scrubbing out what's left + * of that garbaged datastream. + * + * But there seems to be no way to identify the cases where CPPI RNDIS mode + * is appropriate -- which do NOT include RNDIS host drivers, but do include + * the CDC Ethernet driver! -- and the documentation is incomplete/wrong. + * So we can't _ever_ use RX RNDIS mode ... except by using a heuristic + * that applies best on the peripheral side (and which could fail rudely). + * + * Leaving only "transparent" mode; we avoid multi-bd modes in almost all + * cases other than mass storage class. Otherwise we're correct but slow, + * since CPPI penalizes our need for a "true RNDIS" default mode. + */ + + +/* Heuristic, intended to kick in for ethernet/rndis peripheral ONLY + * + * IFF + * (a) peripheral mode ... since rndis peripherals could pad their + * writes to hosts, causing i/o failure; or we'd have to cope with + * a largely unknowable variety of host side protocol variants + * (b) and short reads are NOT errors ... since full reads would + * cause those same i/o failures + * (c) and read length is + * - less than 64KB (max per cppi descriptor) + * - not a multiple of 4096 (g_zero default, full reads typical) + * - N (>1) packets long, ditto (full reads not EXPECTED) + * THEN + * try rx rndis mode + * + * Cost of heuristic failing: RXDMA wedges at the end of transfers that + * fill out the whole buffer. Buggy host side usb network drivers could + * trigger that, but "in the field" such bugs seem to be all but unknown. + * + * So this module parameter lets the heuristic be disabled. When using + * gadgetfs, the heuristic will probably need to be disabled. + */ +static int cppi_rx_rndis = 1; + +module_param(cppi_rx_rndis, bool, 0); +MODULE_PARM_DESC(cppi_rx_rndis, "enable/disable RX RNDIS heuristic"); + + +/** + * cppi_next_rx_segment - dma read for the next chunk of a buffer + * @musb: the controller + * @rx: dma channel + * @onepacket: true unless caller treats short reads as errors, and + * performs fault recovery above usbcore. + * Context: controller irqlocked + * + * See above notes about why we can't use multi-BD RX queues except in + * rare cases (mass storage class), and can never use the hardware "rndis" + * mode (since it's not a "true" RNDIS mode) with complete safety.. + * + * It's ESSENTIAL that callers specify "onepacket" mode unless they kick in + * code to recover from corrupted datastreams after each short transfer. + */ +static void +cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket) +{ + unsigned maxpacket = rx->maxpacket; + dma_addr_t addr = rx->buf_dma + rx->offset; + size_t length = rx->buf_len - rx->offset; + struct cppi_descriptor *bd, *tail; + unsigned n_bds; + unsigned i; + void __iomem *tibase = musb->ctrl_base; + int is_rndis = 0; + struct cppi_rx_stateram __iomem *rx_ram = rx->state_ram; + + if (onepacket) { + /* almost every USB driver, host or peripheral side */ + n_bds = 1; + + /* maybe apply the heuristic above */ + if (cppi_rx_rndis + && is_peripheral_active(musb) + && length > maxpacket + && (length & ~0xffff) == 0 + && (length & 0x0fff) != 0 + && (length & (maxpacket - 1)) == 0) { + maxpacket = length; + is_rndis = 1; + } + } else { + /* virtually nothing except mass storage class */ + if (length > 0xffff) { + n_bds = 0xffff / maxpacket; + length = n_bds * maxpacket; + } else { + n_bds = length / maxpacket; + if (length % maxpacket) + n_bds++; + } + if (n_bds == 1) + onepacket = 1; + else + n_bds = min(n_bds, (unsigned) NUM_RXCHAN_BD); + } + + /* In host mode, autorequest logic can generate some IN tokens; it's + * tricky since we can't leave REQPKT set in RXCSR after the transfer + * finishes. So: multipacket transfers involve two or more segments. + * And always at least two IRQs ... RNDIS mode is not an option. + */ + if (is_host_active(musb)) + n_bds = cppi_autoreq_update(rx, tibase, onepacket, n_bds); + + cppi_rndis_update(rx, 1, musb->ctrl_base, is_rndis); + + length = min(n_bds * maxpacket, length); + + DBG(4, "RX DMA%d seg, maxp %d %s bds %d (cnt %d) " + "dma 0x%x len %u %u/%u\n", + rx->index, maxpacket, + onepacket + ? (is_rndis ? "rndis" : "onepacket") + : "multipacket", + n_bds, + musb_readl(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4)) + & 0xffff, + addr, length, rx->channel.actual_len, rx->buf_len); + + /* only queue one segment at a time, since the hardware prevents + * correct queue shutdown after unexpected short packets + */ + bd = cppi_bd_alloc(rx); + rx->head = bd; + + /* Build BDs for all packets in this segment */ + for (i = 0, tail = NULL; bd && i < n_bds; i++, tail = bd) { + u32 bd_len; + + if (i) { + bd = cppi_bd_alloc(rx); + if (!bd) + break; + tail->next = bd; + tail->hw_next = bd->dma; + } + bd->hw_next = 0; + + /* all but the last packet will be maxpacket size */ + if (maxpacket < length) + bd_len = maxpacket; + else + bd_len = length; + + bd->hw_bufp = addr; + addr += bd_len; + rx->offset += bd_len; + + bd->hw_off_len = (0 /*offset*/ << 16) + bd_len; + bd->buflen = bd_len; + + bd->hw_options = CPPI_OWN_SET | (i == 0 ? length : 0); + length -= bd_len; + } + + /* we always expect at least one reusable BD! */ + if (!tail) { + WARNING("rx dma%d -- no BDs? need %d\n", rx->index, n_bds); + return; + } else if (i < n_bds) + WARNING("rx dma%d -- only %d of %d BDs\n", rx->index, i, n_bds); + + tail->next = NULL; + tail->hw_next = 0; + + bd = rx->head; + rx->tail = tail; + + /* short reads and other faults should terminate this entire + * dma segment. we want one "dma packet" per dma segment, not + * one per USB packet, terminating the whole queue at once... + * NOTE that current hardware seems to ignore SOP and EOP. + */ + bd->hw_options |= CPPI_SOP_SET; + tail->hw_options |= CPPI_EOP_SET; + + if (debug >= 5) { + struct cppi_descriptor *d; + + for (d = rx->head; d; d = d->next) + cppi_dump_rxbd("S", d); + } + + /* in case the preceding transfer left some state... */ + tail = rx->last_processed; + if (tail) { + tail->next = bd; + tail->hw_next = bd->dma; + } + + core_rxirq_enable(tibase, rx->index + 1); + + /* BDs live in DMA-coherent memory, but writes might be pending */ + cpu_drain_writebuffer(); + + /* REVISIT specs say to write this AFTER the BUFCNT register + * below ... but that loses badly. + */ + musb_writel(&rx_ram->rx_head, 0, bd->dma); + + /* bufferCount must be at least 3, and zeroes on completion + * unless it underflows below zero, or stops at two, or keeps + * growing ... grr. + */ + i = musb_readl(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4)) + & 0xffff; + + if (!i) + musb_writel(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4), + n_bds + 2); + else if (n_bds > (i - 3)) + musb_writel(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4), + n_bds - (i - 3)); + + i = musb_readl(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4)) + & 0xffff; + if (i < (2 + n_bds)) { + DBG(2, "bufcnt%d underrun - %d (for %d)\n", + rx->index, i, n_bds); + musb_writel(tibase, + DAVINCI_RXCPPI_BUFCNT0_REG + (rx->index * 4), + n_bds + 2); + } + + cppi_dump_rx(4, rx, "/S"); +} + +/** + * cppi_channel_program - program channel for data transfer + * @ch: the channel + * @maxpacket: max packet size + * @mode: For RX, 1 unless the usb protocol driver promised to treat + * all short reads as errors and kick in high level fault recovery. + * For TX, ignored because of RNDIS mode races/glitches. + * @dma_addr: dma address of buffer + * @len: length of buffer + * Context: controller irqlocked + */ +static int cppi_channel_program(struct dma_channel *ch, + u16 maxpacket, u8 mode, + dma_addr_t dma_addr, u32 len) +{ + struct cppi_channel *cppi_ch; + struct cppi *controller; + struct musb *musb; + + cppi_ch = container_of(ch, struct cppi_channel, channel); + controller = cppi_ch->controller; + musb = controller->musb; + + switch (ch->status) { + case MUSB_DMA_STATUS_BUS_ABORT: + case MUSB_DMA_STATUS_CORE_ABORT: + /* fault irq handler should have handled cleanup */ + WARNING("%cX DMA%d not cleaned up after abort!\n", + cppi_ch->transmit ? 'T' : 'R', + cppi_ch->index); + /* WARN_ON(1); */ + break; + case MUSB_DMA_STATUS_BUSY: + WARNING("program active channel? %cX DMA%d\n", + cppi_ch->transmit ? 'T' : 'R', + cppi_ch->index); + /* WARN_ON(1); */ + break; + case MUSB_DMA_STATUS_UNKNOWN: + DBG(1, "%cX DMA%d not allocated!\n", + cppi_ch->transmit ? 'T' : 'R', + cppi_ch->index); + /* FALLTHROUGH */ + case MUSB_DMA_STATUS_FREE: + break; + } + + ch->status = MUSB_DMA_STATUS_BUSY; + + /* set transfer parameters, then queue up its first segment */ + cppi_ch->buf_dma = dma_addr; + cppi_ch->offset = 0; + cppi_ch->maxpacket = maxpacket; + cppi_ch->buf_len = len; + + /* TX channel? or RX? */ + if (cppi_ch->transmit) + cppi_next_tx_segment(musb, cppi_ch); + else + cppi_next_rx_segment(musb, cppi_ch, mode); + + return true; +} + +static bool cppi_rx_scan(struct cppi *cppi, unsigned ch) +{ + struct cppi_channel *rx = &cppi->rx[ch]; + struct cppi_rx_stateram __iomem *state = rx->state_ram; + struct cppi_descriptor *bd; + struct cppi_descriptor *last = rx->last_processed; + bool completed = false; + bool acked = false; + int i; + dma_addr_t safe2ack; + void __iomem *regs = rx->hw_ep->regs; + + cppi_dump_rx(6, rx, "/K"); + + bd = last ? last->next : rx->head; + if (!bd) + return false; + + /* run through all completed BDs */ + for (i = 0, safe2ack = musb_readl(&state->rx_complete, 0); + (safe2ack || completed) && bd && i < NUM_RXCHAN_BD; + i++, bd = bd->next) { + u16 len; + + /* catch latest BD writes from CPPI */ + rmb(); + if (!completed && (bd->hw_options & CPPI_OWN_SET)) + break; + + DBG(5, "C/RXBD %08x: nxt %08x buf %08x " + "off.len %08x opt.len %08x (%d)\n", + bd->dma, bd->hw_next, bd->hw_bufp, + bd->hw_off_len, bd->hw_options, + rx->channel.actual_len); + + /* actual packet received length */ + if ((bd->hw_options & CPPI_SOP_SET) && !completed) + len = bd->hw_off_len & CPPI_RECV_PKTLEN_MASK; + else + len = 0; + + if (bd->hw_options & CPPI_EOQ_MASK) + completed = true; + + if (!completed && len < bd->buflen) { + /* NOTE: when we get a short packet, RXCSR_H_REQPKT + * must have been cleared, and no more DMA packets may + * active be in the queue... TI docs didn't say, but + * CPPI ignores those BDs even though OWN is still set. + */ + completed = true; + DBG(3, "rx short %d/%d (%d)\n", + len, bd->buflen, + rx->channel.actual_len); + } + + /* If we got here, we expect to ack at least one BD; meanwhile + * CPPI may completing other BDs while we scan this list... + * + * RACE: we can notice OWN cleared before CPPI raises the + * matching irq by writing that BD as the completion pointer. + * In such cases, stop scanning and wait for the irq, avoiding + * lost acks and states where BD ownership is unclear. + */ + if (bd->dma == safe2ack) { + musb_writel(&state->rx_complete, 0, safe2ack); + safe2ack = musb_readl(&state->rx_complete, 0); + acked = true; + if (bd->dma == safe2ack) + safe2ack = 0; + } + + rx->channel.actual_len += len; + + cppi_bd_free(rx, last); + last = bd; + + /* stop scanning on end-of-segment */ + if (bd->hw_next == 0) + completed = true; + } + rx->last_processed = last; + + /* dma abort, lost ack, or ... */ + if (!acked && last) { + int csr; + + if (safe2ack == 0 || safe2ack == rx->last_processed->dma) + musb_writel(&state->rx_complete, 0, safe2ack); + if (safe2ack == 0) { + cppi_bd_free(rx, last); + rx->last_processed = NULL; + + /* if we land here on the host side, H_REQPKT will + * be clear and we need to restart the queue... + */ + WARN_ON(rx->head); + } + musb_ep_select(cppi->mregs, rx->index + 1); + csr = musb_readw(regs, MUSB_RXCSR); + if (csr & MUSB_RXCSR_DMAENAB) { + DBG(4, "list%d %p/%p, last %08x%s, csr %04x\n", + rx->index, + rx->head, rx->tail, + rx->last_processed + ? rx->last_processed->dma + : 0, + c |