From 0e9913362a967377eb886bbdf305ec58aa07a878 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 6 May 2008 15:03:38 -0700 Subject: RDMA/cxgb3: Don't add PBL memory to gen_pool in chunks Current iw_cxgb3 code adds PBL memory to the driver's gen_pool in 2 MB chunks. This limits the largest single allocation that can be done to the same size, which means that with 4 KB pages, each of which takes 8 bytes of PBL memory, the largest memory region that can be allocated is 1 GB (256K PBL entries * 4 KB/entry). Remove this limit by adding all the PBL memory in a single gen_pool chunk, if possible. Add code that falls back to smaller chunks if gen_pool_add() fails, which can happen if there is not sufficient contiguous lowmem for the internal gen_pool bitmap. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/cxio_resource.c | 36 ++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c index 45ed4f25ef7..bd233c08765 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_resource.c +++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c @@ -250,7 +250,6 @@ void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp) */ #define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */ -#define PBL_CHUNK 2*1024*1024 u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size) { @@ -267,14 +266,35 @@ void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size) int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p) { - unsigned long i; + unsigned pbl_start, pbl_chunk; + rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1); - if (rdev_p->pbl_pool) - for (i = rdev_p->rnic_info.pbl_base; - i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1; - i += PBL_CHUNK) - gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1); - return rdev_p->pbl_pool ? 
0 : -ENOMEM; + if (!rdev_p->pbl_pool) + return -ENOMEM; + + pbl_start = rdev_p->rnic_info.pbl_base; + pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1; + + while (pbl_start < rdev_p->rnic_info.pbl_top) { + pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1, + pbl_chunk); + if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) { + PDBG("%s failed to add PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) { + printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n", + __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start); + return 0; + } + pbl_chunk >>= 1; + } else { + PDBG("%s added PBL chunk (%x/%x)\n", + __func__, pbl_start, pbl_chunk); + pbl_start += pbl_chunk; + } + } + + return 0; } void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p) -- cgit v1.2.3-18-g5258 From 273748cc908a901d082b4da5a16b2541c9d78a02 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Tue, 6 May 2008 15:56:22 -0700 Subject: RDMA/cxgb3: Fix severe limit on userspace memory registration size Currently, iw_cxgb3 is severely limited on the amount of userspace memory that can be registered in a single memory region, which causes big problems for applications that expect to be able to register 100s of MB. The problem is that the driver uses a single kmalloc()ed buffer to hold the physical buffer list (PBL) for the entire memory region during registration, which means that 8 bytes of contiguous memory are required for each page of memory being registered. For example, a 64 MB registration will require 128 KB of contiguous memory with 4 KB pages, and it is unlikely that such an allocation will succeed on a busy system. This is purely a driver problem: the temporary page list buffer is not needed by the hardware, so we can fix this by writing the PBL to the hardware in page-sized chunks rather than all at once. 
We do this by splitting the memory registration operation up into several steps: - Allocate PBL space in adapter memory for the full registration - Copy PBL to adapter memory in chunks - Allocate STag and enable memory region This also allows several other cleanups to the __cxio_tpt_op() interface and related parts of the driver. This change leaves the reregister memory region and memory window operations broken, but they already didn't work due to other longstanding bugs, so fixing them will be left to a later patch. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb3/cxio_hal.c | 90 ++++++++++++++--------------- drivers/infiniband/hw/cxgb3/cxio_hal.h | 8 +-- drivers/infiniband/hw/cxgb3/iwch_mem.c | 75 +++++++++++++++--------- drivers/infiniband/hw/cxgb3/iwch_provider.c | 68 +++++++++++++++++----- drivers/infiniband/hw/cxgb3/iwch_provider.h | 8 +-- 5 files changed, 155 insertions(+), 94 deletions(-) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index 5fd8506a865..ebf9d3043f8 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -588,7 +588,7 @@ static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p) * caller aquires the ctrl_qp lock before the call */ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, - u32 len, void *data, int completion) + u32 len, void *data) { u32 i, nr_wqe, copy_len; u8 *copy_data; @@ -624,7 +624,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, flag = 0; if (i == (nr_wqe - 1)) { /* last WQE */ - flag = completion ? T3_COMPLETION_FLAG : 0; + flag = T3_COMPLETION_FLAG; if (len % 32) utx_len = len / 32 + 1; else @@ -683,21 +683,20 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr, return 0; } -/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size - * OUT: stag index, actual pbl_size, pbl_addr allocated. 
+/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr + * OUT: stag index * TBD: shared memory region support */ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, u32 *stag, u8 stag_state, u32 pdid, enum tpt_mem_type type, enum tpt_mem_perm perm, - u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl, - u32 *pbl_size, u32 *pbl_addr) + u32 zbva, u64 to, u32 len, u8 page_size, + u32 pbl_size, u32 pbl_addr) { int err; struct tpt_entry tpt; u32 stag_idx; u32 wptr; - int rereg = (*stag != T3_STAG_UNSET); stag_state = stag_state > 0; stag_idx = (*stag) >> 8; @@ -711,30 +710,8 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n", __func__, stag_state, type, pdid, stag_idx); - if (reset_tpt_entry) - cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3); - else if (!rereg) { - *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3); - if (!*pbl_addr) { - return -ENOMEM; - } - } - mutex_lock(&rdev_p->ctrl_qp.lock); - /* write PBL first if any - update pbl only if pbl list exist */ - if (pbl) { - - PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", - __func__, *pbl_addr, rdev_p->rnic_info.pbl_base, - *pbl_size); - err = cxio_hal_ctrl_qp_write_mem(rdev_p, - (*pbl_addr >> 5), - (*pbl_size << 3), pbl, 0); - if (err) - goto ret; - } - /* write TPT entry */ if (reset_tpt_entry) memset(&tpt, 0, sizeof(tpt)); @@ -749,23 +726,23 @@ static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry, V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) | V_TPT_PAGE_SIZE(page_size)); tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 : - cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3)); + cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3)); tpt.len = cpu_to_be32(len); tpt.va_hi = cpu_to_be32((u32) (to >> 32)); tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL)); tpt.rsvd_bind_cnt_or_pstag = 0; tpt.rsvd_pbl_size = reset_tpt_entry ? 
0 : - cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2)); + cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2)); } err = cxio_hal_ctrl_qp_write_mem(rdev_p, stag_idx + (rdev_p->rnic_info.tpt_base >> 5), - sizeof(tpt), &tpt, 1); + sizeof(tpt), &tpt); /* release the stag index to free pool */ if (reset_tpt_entry) cxio_hal_put_stag(rdev_p->rscp, stag_idx); -ret: + wptr = rdev_p->ctrl_qp.wptr; mutex_unlock(&rdev_p->ctrl_qp.lock); if (!err) @@ -776,44 +753,67 @@ ret: return err; } +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size) +{ + u32 wptr; + int err; + + PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n", + __func__, pbl_addr, rdev_p->rnic_info.pbl_base, + pbl_size); + + mutex_lock(&rdev_p->ctrl_qp.lock); + err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3, + pbl); + wptr = rdev_p->ctrl_qp.wptr; + mutex_unlock(&rdev_p->ctrl_qp.lock); + if (err) + return err; + + if (wait_event_interruptible(rdev_p->ctrl_qp.waitq, + SEQ32_GE(rdev_p->ctrl_qp.rptr, + wptr))) + return -ERESTARTSYS; + + return 0; +} + int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr) + u8 page_size, u32 pbl_size, u32 pbl_addr) { return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm, - zbva, to, len, page_size, pbl, pbl_size, pbl_addr); + zbva, to, len, page_size, pbl_size, pbl_addr); } int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size, u32 pbl_addr) { - return 
__cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - &pbl_size, &pbl_addr); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + pbl_size, pbl_addr); } int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid) { - u32 pbl_size = 0; *stag = T3_STAG_UNSET; return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0, - NULL, &pbl_size, NULL); + 0, 0); } int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag) { - return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL, - NULL, NULL); + return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, + 0, 0); } int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr) diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 69ab08ebc68..6e128f6bab0 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -154,14 +154,14 @@ int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq, struct cxio_ucontext *uctx); int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode); +int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl, + u32 pbl_addr, u32 pbl_size); int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid, enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len, - u8 page_size, __be64 *pbl, u32 *pbl_size, - u32 *pbl_addr); + u8 page_size, u32 pbl_size, u32 pbl_addr); int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size, u32 pbl_addr); int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid); diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c 
b/drivers/infiniband/hw/cxgb3/iwch_mem.c index 58c3d61bcd1..ec49a5cbdeb 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -35,17 +35,26 @@ #include #include "cxio_hal.h" +#include "cxio_resource.h" #include "iwch.h" #include "iwch_provider.h" -int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list) +static void iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag) { - u32 stag; u32 mmid; + mhp->attr.state = 1; + mhp->attr.stag = stag; + mmid = stag >> 8; + mhp->ibmr.rkey = mhp->ibmr.lkey = stag; + insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid); + PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); +} + +int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, + struct iwch_mr *mhp, int shift) +{ + u32 stag; if (cxio_register_phys_mem(&rhp->rdev, &stag, mhp->attr.pdid, @@ -53,28 +62,21 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + return 0; } int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages) { u32 stag; - u32 mmid; - /* We could support this... 
*/ if (npages > mhp->attr.pbl_size) @@ -87,19 +89,40 @@ int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, mhp->attr.zbva, mhp->attr.va_fbo, mhp->attr.len, - shift-12, - page_list, - &mhp->attr.pbl_size, &mhp->attr.pbl_addr)) + shift - 12, + mhp->attr.pbl_size, mhp->attr.pbl_addr)) return -ENOMEM; - mhp->attr.state = 1; - mhp->attr.stag = stag; - mmid = stag >> 8; - mhp->ibmr.rkey = mhp->ibmr.lkey = stag; - insert_handle(rhp, &rhp->mmidr, mhp, mmid); - PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp); + + iwch_finish_mem_reg(mhp, stag); + + return 0; +} + +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) +{ + mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, + npages << 3); + + if (!mhp->attr.pbl_addr) + return -ENOMEM; + + mhp->attr.pbl_size = npages; + return 0; } +void iwch_free_pbl(struct iwch_mr *mhp) +{ + cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, + mhp->attr.pbl_size << 3); +} + +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) +{ + return cxio_write_pbl(&mhp->rhp->rdev, pages, + mhp->attr.pbl_addr + (offset << 3), npages); +} + int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index d07d3a377b5..8934178a23e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -442,6 +442,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) mmid = mhp->attr.stag >> 8; cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, mhp->attr.pbl_addr); + iwch_free_pbl(mhp); remove_handle(rhp, &rhp->mmidr, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); @@ -475,6 +476,8 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + /* First check that we have enough alignment */ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr 
& ~PAGE_MASK)) { ret = -EINVAL; @@ -492,7 +495,17 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, if (ret) goto err; - mhp->rhp = rhp; + ret = iwch_alloc_pbl(mhp, npages); + if (ret) { + kfree(page_list); + goto err_pbl; + } + + ret = iwch_write_pbl(mhp, page_list, npages, 0); + kfree(page_list); + if (ret) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; @@ -502,12 +515,15 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.len = (u32) total_size; mhp->attr.pbl_size = npages; - ret = iwch_register_mem(rhp, php, mhp, shift, page_list); - kfree(page_list); - if (ret) { - goto err; - } + ret = iwch_register_mem(rhp, php, mhp, shift); + if (ret) + goto err_pbl; + return &mhp->ibmr; + +err_pbl: + iwch_free_pbl(mhp); + err: kfree(mhp); return ERR_PTR(ret); @@ -560,7 +576,7 @@ static int iwch_reregister_phys_mem(struct ib_mr *mr, return ret; } - ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages); + ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); kfree(page_list); if (ret) { return ret; @@ -602,6 +618,8 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mhp) return ERR_PTR(-ENOMEM); + mhp->rhp = rhp; + mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(mhp->umem)) { err = PTR_ERR(mhp->umem); @@ -615,10 +633,14 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, list_for_each_entry(chunk, &mhp->umem->chunk_list, list) n += chunk->nents; - pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + err = iwch_alloc_pbl(mhp, n); + if (err) + goto err; + + pages = (__be64 *) __get_free_page(GFP_KERNEL); if (!pages) { err = -ENOMEM; - goto err; + goto err_pbl; } i = n = 0; @@ -630,25 +652,38 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, pages[i++] = cpu_to_be64(sg_dma_address( &chunk->page_list[j]) + mhp->umem->page_size * k); + if (i == PAGE_SIZE / sizeof *pages) { + err = 
iwch_write_pbl(mhp, pages, i, n); + if (err) + goto pbl_done; + n += i; + i = 0; + } } } - mhp->rhp = rhp; + if (i) + err = iwch_write_pbl(mhp, pages, i, n); + +pbl_done: + free_page((unsigned long) pages); + if (err) + goto err_pbl; + mhp->attr.pdid = php->pdid; mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); mhp->attr.va_fbo = virt; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) length; - mhp->attr.pbl_size = i; - err = iwch_register_mem(rhp, php, mhp, shift, pages); - kfree(pages); + + err = iwch_register_mem(rhp, php, mhp, shift); if (err) - goto err; + goto err_pbl; if (udata && !t3a_device(rhp)) { uresp.pbl_addr = (mhp->attr.pbl_addr - - rhp->rdev.rnic_info.pbl_base) >> 3; + rhp->rdev.rnic_info.pbl_base) >> 3; PDBG("%s user resp pbl_addr 0x%x\n", __func__, uresp.pbl_addr); @@ -661,6 +696,9 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return &mhp->ibmr; +err_pbl: + iwch_free_pbl(mhp); + err: ib_umem_release(mhp->umem); kfree(mhp); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index db5100d27ca..836163fc542 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -340,14 +340,14 @@ int iwch_quiesce_qps(struct iwch_cq *chp); int iwch_resume_qps(struct iwch_cq *chp); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - __be64 *page_list); + struct iwch_mr *mhp, int shift); int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift, - __be64 *page_list, int npages); +int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); +void iwch_free_pbl(struct iwch_mr *mhp); +int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); int build_phys_page_list(struct ib_phys_buf *buffer_list, int num_phys_buf, u64 *iova_start, -- cgit v1.2.3-18-g5258 From 
5f51efc195dfb860c60fafb4e47fe4b7cad2626d Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Wed, 7 May 2008 10:56:47 -0700 Subject: IB/ipath: Only warn about prototype chip during init We warn about prototype chips, but the function that checks for support is also called as a result of a get_portinfo request, which can clutter the logs. Restrict warning to only appear during initialization. Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_iba7220.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index e3ec0d1bdf5..5f693de6654 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -870,8 +870,9 @@ static int ipath_7220_boardname(struct ipath_devdata *dd, char *name, "revision %u.%u!\n", dd->ipath_majrev, dd->ipath_minrev); ret = 1; - } else if (dd->ipath_minrev == 1) { - /* Rev1 chips are prototype. Complain, but allow use */ + } else if (dd->ipath_minrev == 1 && + !(dd->ipath_flags & IPATH_INITTED)) { + /* Rev1 chips are prototype. Complain at init, but allow use */ ipath_dev_err(dd, "Unsupported hardware " "revision %u.%u, Contact support@qlogic.com\n", dd->ipath_majrev, dd->ipath_minrev); -- cgit v1.2.3-18-g5258 From 6e87d1500713767866db0668bbcec75719576f3c Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 7 May 2008 10:57:14 -0700 Subject: IB/ipath: Only increment SSN if WQE is put on send queue If a send work request has immediate errors and is not put on the send queue, we shouldn't update any of the QP state. The increment of the SSN wasn't obeying this. 
Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index e63927cce5b..5015cd2e57b 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -396,7 +396,6 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) wqe = get_swqe_ptr(qp, qp->s_head); wqe->wr = *wr; - wqe->ssn = qp->s_ssn++; wqe->length = 0; if (wr->num_sge) { acc = wr->opcode >= IB_WR_RDMA_READ ? @@ -422,6 +421,7 @@ static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) goto bail_inval; } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) goto bail_inval; + wqe->ssn = qp->s_ssn++; qp->s_head = next; ret = 0; -- cgit v1.2.3-18-g5258 From b4d390d8d219452e5d4257c87134a6934d7fabeb Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Wed, 7 May 2008 10:57:48 -0700 Subject: IB/ipath: Fix bug that can leave sends disabled after freeze recovery The semantics of cancel_sends changed, but the code using it was missed. Don't leave sends and pioavail updates disabled, and add a comment as to why the force update is needed. 
Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_intr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 1b58f4737c7..45c4c068ab1 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -933,11 +933,15 @@ void ipath_clear_freeze(struct ipath_devdata *dd) * therefore would not be sent, and eventually * might cause the process to run out of bufs */ - ipath_cancel_sends(dd, 0); + ipath_cancel_sends(dd, 1); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control); - /* ensure pio avail updates continue */ + /* + * ensure pio avail updates continue (because the update + * won't have happened from cancel_sends because we were + * still in freeze + */ ipath_force_pio_avail_update(dd); /* -- cgit v1.2.3-18-g5258 From 2bfc8e9edf200aeeca18ee44bcbf6bce65438a42 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 7 May 2008 10:58:50 -0700 Subject: IB/ipath: Return the correct opcode for RDMA WRITE with immediate This patch fixes a bug in the RC responder which generates a completion entry with the wrong opcode when an RDMA WRITE with immediate is received. 
Signed-off-by: Ralph Campbell Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_rc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index c405dfba553..08b11b56761 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1746,7 +1746,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, qp->r_wrid_valid = 0; wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; - wc.opcode = IB_WC_RECV; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; wc.vendor_err = 0; wc.qp = &qp->ibqp; wc.src_qp = qp->remote_qpn; -- cgit v1.2.3-18-g5258 From 2889d1ef1240591fa4c72a6753e0a8d1c6e18140 Mon Sep 17 00:00:00 2001 From: Michael Albaugh Date: Wed, 7 May 2008 10:59:23 -0700 Subject: IB/ipath: Fix count of packets received by kernel The loop in ipath_kreceive() that processes packets increments the loop-index 'i' once too often, because the exit condition does not depend on it, and is checked after the increment. By adding a check for !last to the iterator in the for loop, we correct that in a way that is not so likely to be re-broken by changes in the loop body. 
Signed-off-by: Michael Albaugh Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index acf30c06a0c..f81dd4acdc6 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1197,7 +1197,7 @@ void ipath_kreceive(struct ipath_portdata *pd) } reloop: - for (last = 0, i = 1; !last; i++) { + for (last = 0, i = 1; !last; i += !last) { hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); eflags = ipath_hdrget_err_flags(rhf_addr); etype = ipath_hdrget_rcv_type(rhf_addr); -- cgit v1.2.3-18-g5258 From e2ab41cae418108f376ad1634d7507f56379f7a2 Mon Sep 17 00:00:00 2001 From: Dave Olson Date: Wed, 7 May 2008 11:00:15 -0700 Subject: IB/ipath: Need to always request and handle PIO avail interrupts Now that we always use PIO for vl15 on 7220, we could get stuck forever if we happened to run out of PIO buffers from the verbs code, because the setup code wouldn't run; the interrupt was also ignored if SDMA was supported. We also have to reduce the pio update threshold if we have fewer kernel buffers than the existing threshold. Clean up the initialization a bit to get ordering safer and more sensible, and use the existing ipath_chg_pioavailkernel call to do init, rather than doing it separately. Drop unnecessary clearing of pio buffer on pio parity error. Drop incorrect updating of pioavailshadow when exiting freeze mode (software state may not match chip state if buffer has been allocated and not yet written). If we couldn't get a kernel buffer for a while, make sure we are in sync with hardware, mainly to handle the exiting freeze case. 
Signed-off-by: Dave Olson Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 128 +++++++++++++++++++++++--- drivers/infiniband/hw/ipath/ipath_file_ops.c | 72 ++++++--------- drivers/infiniband/hw/ipath/ipath_iba7220.c | 21 ++--- drivers/infiniband/hw/ipath/ipath_init_chip.c | 95 +++++++++---------- drivers/infiniband/hw/ipath/ipath_intr.c | 82 +++-------------- drivers/infiniband/hw/ipath/ipath_kernel.h | 8 +- drivers/infiniband/hw/ipath/ipath_ruc.c | 7 +- drivers/infiniband/hw/ipath/ipath_sdma.c | 13 ++- 8 files changed, 224 insertions(+), 202 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index f81dd4acdc6..2036d38fac4 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1428,6 +1428,40 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd) spin_unlock_irqrestore(&ipath_pioavail_lock, flags); } +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. 
+ */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, oldval, dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + /** * ipath_setrcvhdrsize - set the receive header size * @dd: the infinipath device @@ -1482,9 +1516,12 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ ipath_stats.sps_nopiobufs++; if (!(++dd->ipath_consec_nopiobuf % 100000)) { - ipath_dbg("%u pio sends with no bufavail; dmacopy: " - "%llx %llx %llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), (unsigned long long) le64_to_cpu(dma[0]), (unsigned long long) le64_to_cpu(dma[1]), (unsigned long long) le64_to_cpu(dma[2]), @@ -1496,14 +1533,17 @@ static noinline void no_pio_bufs(struct ipath_devdata *dd) */ if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > (sizeof(shadow[0]) * 4 * 4)) - ipath_dbg("2nd group: dmacopy: %llx %llx " - "%llx %llx; shadow: %lx %lx %lx %lx\n", + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", (unsigned long long)le64_to_cpu(dma[4]), (unsigned long long)le64_to_cpu(dma[5]), (unsigned long long)le64_to_cpu(dma[6]), (unsigned long long)le64_to_cpu(dma[7]), - shadow[4], shadow[5], shadow[6], - shadow[7]); + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); } } @@ -1652,19 +1692,46 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, unsigned len, int avail) { unsigned long flags; - unsigned end; + unsigned end, cnt = 0, next; /* 
There are two bits per send buffer (busy and generation) */ start *= 2; - len *= 2; - end = start + len; + end = start + len * 2; - /* Set or clear the generation bits. */ spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ while (start < end) { if (avail) { - __clear_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, - dd->ipath_pioavailshadow); + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? 
+ i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); __set_bit(start, dd->ipath_pioavailkernel); } else { __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, @@ -1673,7 +1740,44 @@ void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, } start += 2; } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + next = find_first_bit(dd->ipath_pioavailkernel, end); + while (next < end) { + cnt++; + next = find_next_bit(dd->ipath_pioavailkernel, end, + next + 1); + } + } spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. 
+ */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } } /** diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 8b1752202e7..3295177c937 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -173,47 +173,25 @@ static int ipath_get_base_info(struct file *fp, (void *) dd->ipath_statusp - (void *) dd->ipath_pioavailregs_dma; if (!shared) { - kinfo->spi_piocnt = dd->ipath_pbufsport; + kinfo->spi_piocnt = pd->port_piocnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs; kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; } else if (master) { - kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) + - (dd->ipath_pbufsport % subport_cnt); + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); /* Master's PIO buffers are after all the slave's */ kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * - (dd->ipath_pbufsport - kinfo->spi_piocnt); + (pd->port_piocnt - kinfo->spi_piocnt); } else { unsigned slave = subport_fp(fp) - 1; - kinfo->spi_piocnt = dd->ipath_pbufsport / subport_cnt; + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; kinfo->spi_piobufbase = (u64) pd->port_piobufs + dd->ipath_palign * kinfo->spi_piocnt * slave; } - /* - * Set the PIO avail update threshold to no larger - * than the number of buffers per process. 
Note that - * we decrease it here, but won't ever increase it. - */ - if (dd->ipath_pioupd_thresh && - kinfo->spi_piocnt < dd->ipath_pioupd_thresh) { - unsigned long flags; - - dd->ipath_pioupd_thresh = kinfo->spi_piocnt; - ipath_dbg("Decreased pio update threshold to %u\n", - dd->ipath_pioupd_thresh); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK - << INFINIPATH_S_UPDTHRESH_SHIFT); - dd->ipath_sendctrl |= dd->ipath_pioupd_thresh - << INFINIPATH_S_UPDTHRESH_SHIFT; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, - dd->ipath_sendctrl); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - } - if (shared) { kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; @@ -1309,19 +1287,19 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; if (!pd->port_subport_cnt) { /* port is not shared */ - piocnt = dd->ipath_pbufsport; + piocnt = pd->port_piocnt; piobufs = pd->port_piobufs; } else if (!subport_fp(fp)) { /* caller is the master */ - piocnt = (dd->ipath_pbufsport / pd->port_subport_cnt) + - (dd->ipath_pbufsport % pd->port_subport_cnt); + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); piobufs = pd->port_piobufs + - dd->ipath_palign * (dd->ipath_pbufsport - piocnt); + dd->ipath_palign * (pd->port_piocnt - piocnt); } else { unsigned slave = subport_fp(fp) - 1; /* caller is a slave */ - piocnt = dd->ipath_pbufsport / pd->port_subport_cnt; + piocnt = pd->port_piocnt / pd->port_subport_cnt; piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; } @@ -1633,9 +1611,6 @@ static int try_alloc_port(struct ipath_devdata *dd, int port, port_fp(fp) = pd; pd->port_pid = current->pid; strncpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); - ipath_chg_pioavailkernel(dd, - dd->ipath_pbufsport * (pd->port_port - 1), - 
dd->ipath_pbufsport, 0); ipath_stats.sps_ports++; ret = 0; } else @@ -1938,11 +1913,25 @@ static int ipath_do_user_init(struct file *fp, /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); pd->port_piobufs = dd->ipath_piobufbase + - dd->ipath_pbufsport * (pd->port_port - 1) * dd->ipath_palign; - ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n", - pd->port_port, pd->port_piobufs); + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); /* * Now allocate the rcvhdr Q and eager TIDs; skip the TID @@ -2107,7 +2096,6 @@ static int ipath_close(struct inode *in, struct file *fp) } if (dd->ipath_kregbase) { - int i; /* atomically clear receive enable port and intr avail. 
*/ clear_bit(dd->ipath_r_portenable_shift + port, &dd->ipath_rcvctrl); @@ -2136,9 +2124,9 @@ static int ipath_close(struct inode *in, struct file *fp) ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, pd->port_port, dd->ipath_dummy_hdrq_phys); - i = dd->ipath_pbufsport * (port - 1); - ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport); - ipath_chg_pioavailkernel(dd, i, dd->ipath_pbufsport, 1); + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); dd->ipath_f_clear_tids(dd, pd->port_port); diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index 5f693de6654..8eee7830f04 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -595,7 +595,7 @@ static void ipath_7220_txe_recover(struct ipath_devdata *dd) dev_info(&dd->pcidev->dev, "Recovering from TXE PIO parity error\n"); - ipath_disarm_senderrbufs(dd, 1); + ipath_disarm_senderrbufs(dd); } @@ -675,10 +675,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { /* - * Parity errors in send memory are recoverable, - * just cancel the send (if indicated in * sendbuffererror), - * count the occurrence, unfreeze (if no other handled - * hardware error bits are set), and continue. + * Parity errors in send memory are recoverable by h/w + * just do housekeeping, exit freeze mode and continue. 
*/ if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) @@ -687,13 +685,6 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); - if (!hwerrs) { - /* else leave in freeze mode */ - ipath_write_kreg(dd, - dd->ipath_kregs->kr_control, - dd->ipath_control); - goto bail; - } } if (hwerrs) { /* @@ -723,8 +714,8 @@ static void ipath_7220_handle_hwerrors(struct ipath_devdata *dd, char *msg, *dd->ipath_statusp |= IPATH_STATUS_HWERROR; dd->ipath_flags &= ~IPATH_INITTED; } else { - ipath_dbg("Clearing freezemode on ignored hardware " - "error\n"); + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); ipath_clear_freeze(dd); } } @@ -1967,7 +1958,7 @@ static void ipath_7220_config_ports(struct ipath_devdata *dd, ushort cfgports) dd->ipath_rcvctrl); dd->ipath_p0_rcvegrcnt = 2048; /* always */ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) - dd->ipath_pioreserved = 1; /* reserve a buffer */ + dd->ipath_pioreserved = 3; /* kpiobufs used for PIO */ } diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c index 27dd8947666..3e5baa43fc8 100644 --- a/drivers/infiniband/hw/ipath/ipath_init_chip.c +++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c @@ -41,7 +41,7 @@ /* * min buffers we want to have per port, after driver */ -#define IPATH_MIN_USER_PORT_BUFCNT 8 +#define IPATH_MIN_USER_PORT_BUFCNT 7 /* * Number of ports we are configured to use (to allow for more pio @@ -54,13 +54,9 @@ MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); /* * Number of buffers reserved for driver (verbs and layered drivers.) - * Reserved at end of buffer list. Initialized based on - * number of PIO buffers if not set via module interface. + * Initialized based on number of PIO buffers if not set via module interface. 
* The problem with this is that it's global, but we'll use different - * numbers for different chip types. So the default value is not - * very useful. I've redefined it for the 1.3 release so that it's - * zero unless set by the user to something else, in which case we - * try to respect it. + * numbers for different chip types. */ static ushort ipath_kpiobufs; @@ -546,9 +542,12 @@ static void enable_chip(struct ipath_devdata *dd, int reinit) pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; else pioavail = dd->ipath_pioavailregs_dma[i]; - dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail) | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); } /* can get counters, stats, etc. */ dd->ipath_flags |= IPATH_PRESENT; @@ -708,12 +707,11 @@ static void verify_interrupt(unsigned long opaque) int ipath_init_chip(struct ipath_devdata *dd, int reinit) { int ret = 0; - u32 val32, kpiobufs; + u32 kpiobufs, defkbufs; u32 piobufs, uports; u64 val; struct ipath_portdata *pd; gfp_t gfp_flags = GFP_USER | __GFP_COMP; - unsigned long flags; ret = init_housekeeping(dd, reinit); if (ret) @@ -753,69 +751,52 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) / (sizeof(u64) * BITS_PER_BYTE / 2); uports = dd->ipath_cfgports ? 
dd->ipath_cfgports - 1 : 0; - if (ipath_kpiobufs == 0) { - /* not set by user (this is default) */ - if (piobufs > 144) - kpiobufs = 32; - else - kpiobufs = 16; - } + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; else - kpiobufs = ipath_kpiobufs; + defkbufs = 16 + dd->ipath_pioreserved; - if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) { + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { int i = (int) piobufs - (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); if (i < 1) i = 1; dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " "%d for kernel leaves too few for %d user ports " - "(%d each); using %u\n", kpiobufs, + "(%d each); using %u\n", ipath_kpiobufs, piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); /* * shouldn't change ipath_kpiobufs, because could be * different for different devices... */ kpiobufs = i; - } + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; dd->ipath_lastport_piobuf = piobufs - kpiobufs; dd->ipath_pbufsport = uports ? 
dd->ipath_lastport_piobuf / uports : 0; - val32 = dd->ipath_lastport_piobuf - (dd->ipath_pbufsport * uports); - if (val32 > 0) { - ipath_dbg("allocating %u pbufs/port leaves %u unused, " - "add to kernel\n", dd->ipath_pbufsport, val32); - dd->ipath_lastport_piobuf -= val32; - kpiobufs += val32; - ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n", - dd->ipath_pbufsport, val32); - } + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); dd->ipath_lastpioindex = 0; dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; - ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + /* ipath_pioavailshadow initialized earlier */ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " "each for %u user ports\n", kpiobufs, piobufs, dd->ipath_pbufsport, uports); - if (dd->ipath_pioupd_thresh) { - if (dd->ipath_pbufsport < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = dd->ipath_pbufsport; - if (kpiobufs < dd->ipath_pioupd_thresh) - dd->ipath_pioupd_thresh = kpiobufs; - } - ret = dd->ipath_f_early_init(dd); if (ret) { ipath_dev_err(dd, "Early initialization failure\n"); goto done; } - /* - * Cancel any possible active sends from early driver load. - * Follows early_init because some chips have to initialize - * PIO buffers in early_init to avoid false parity errors. - */ - ipath_cancel_sends(dd, 0); - /* * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be * done after early_init. @@ -836,6 +817,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, dd->ipath_pioavailregs_phys); + /* * this is to detect s/w errors, which the h/w works around by * ignoring the low 6 bits of address, if it wasn't aligned. 
@@ -862,12 +844,6 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); - spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); - dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE; - ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); - ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - /* * before error clears, since we expect serdes pll errors during * this, the first time after reset @@ -940,6 +916,19 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit) else enable_chip(dd, reinit); + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + if (!reinit) { /* * Used when we close a port, for DMA already in flight diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 45c4c068ab1..26900b3b7a4 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -38,42 +38,12 @@ #include "ipath_verbs.h" #include "ipath_common.h" -/* - * clear (write) a pio buffer, to clear a parity error. This routine - * should only be called when in freeze mode, and the buffer should be - * canceled afterwards. 
- */ -static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum) -{ - u32 __iomem *pbuf; - u32 dwcnt; /* dword count to write */ - if (pnum < dd->ipath_piobcnt2k) { - pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum * - dd->ipath_palign); - dwcnt = dd->ipath_piosize2k >> 2; - } - else { - pbuf = (u32 __iomem *) (dd->ipath_pio4kbase + - (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign); - dwcnt = dd->ipath_piosize4k >> 2; - } - dev_info(&dd->pcidev->dev, - "Rewrite PIO buffer %u, to recover from parity error\n", - pnum); - - /* no flush required, since already in freeze */ - writel(dwcnt + 1, pbuf); - while (--dwcnt) - writel(0, pbuf++); -} /* * Called when we might have an error that is specific to a particular * PIO buffer, and may need to cancel that buffer, so it can be re-used. - * If rewrite is true, and bits are set in the sendbufferror registers, - * we'll write to the buffer, for error recovery on parity errors. */ -void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) { u32 piobcnt; unsigned long sbuf[4]; @@ -109,11 +79,8 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite) } for (i = 0; i < piobcnt; i++) - if (test_bit(i, sbuf)) { - if (rewrite) - ipath_clrpiobuf(dd, i); + if (test_bit(i, sbuf)) ipath_disarm_piobufs(dd, i, 1); - } /* ignore armlaunch errs for a bit */ dd->ipath_lastcancel = jiffies+3; } @@ -164,7 +131,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) { u64 ignore_this_time = 0; - ipath_disarm_senderrbufs(dd, 0); + ipath_disarm_senderrbufs(dd); if ((errs & E_SUM_LINK_PKTERRS) && !(dd->ipath_flags & IPATH_LINKACTIVE)) { /* @@ -909,8 +876,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) * processes (causing armlaunch), send errors due to going into freeze mode, * etc., and try to avoid causing extra interrupts while doing so. 
* Forcibly update the in-memory pioavail register copies after cleanup - * because the chip won't do it for anything changing while in freeze mode - * (we don't want to wait for the next pio buffer state change). + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). * Make sure that we don't lose any important interrupts by using the chip * feature that says that writing 0 to a bit in *clear that is set in * *status will cause an interrupt to be generated again (if allowed by @@ -918,47 +885,22 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) */ void ipath_clear_freeze(struct ipath_devdata *dd) { - int i, im; - u64 val; - /* disable error interrupts, to avoid confusion */ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); /* also disable interrupts; errormask is sometimes overwriten */ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); - /* - * clear all sends, because they have may been - * completed by usercode while in freeze mode, and - * therefore would not be sent, and eventually - * might cause the process to run out of bufs - */ ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); - /* - * ensure pio avail updates continue (because the update - * won't have happened from cancel_sends because we were - * still in freeze - */ + /* force in-memory update now we are out of freeze */ ipath_force_pio_avail_update(dd); - /* - * We just enabled pioavailupdate, so dma copy is almost certainly - * not yet right, so read the registers directly. Similar to init - */ - for (i = 0; i < dd->ipath_pioavregs; i++) { - /* deal with 6110 chip bug */ - im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? 
- i ^ 1 : i; - val = ipath_read_kreg64(dd, (0x1000 / sizeof(u64)) + im); - dd->ipath_pioavailregs_dma[i] = cpu_to_le64(val); - dd->ipath_pioavailshadow[i] = val | - (~dd->ipath_pioavailkernel[i] << - INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT); - } - /* * force new interrupt if any hwerr, error or interrupt bits are * still set, and clear "safe" send packet errors related to freeze @@ -1316,10 +1258,8 @@ irqreturn_t ipath_intr(int irq, void *data) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) - handle_layer_pioavail(dd); - else - ipath_dbg("unexpected BUFAVAIL intr\n"); + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); } ret = IRQ_HANDLED; diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 202337ae90d..02b24a34059 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h @@ -117,6 +117,10 @@ struct ipath_portdata { u16 port_subport_cnt; /* non-zero if port is being shared. 
*/ u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; /* chip offset of PIO buffers for this port */ u32 port_piobufs; /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ @@ -384,6 +388,8 @@ struct ipath_devdata { u32 ipath_lastrpkts; /* pio bufs allocated per port */ u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; u32 ipath_pioupd_thresh; /* update threshold, some chips */ /* * number of ports configured as max; zero is set to number chip @@ -1011,7 +1017,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *); int ipath_update_eeprom_log(struct ipath_devdata *dd); void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); -void ipath_disarm_senderrbufs(struct ipath_devdata *, int); +void ipath_disarm_senderrbufs(struct ipath_devdata *); void ipath_force_pio_avail_update(struct ipath_devdata *); void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 8ac5c1d82cc..9e3fe61cbd0 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -481,9 +481,10 @@ done: wake_up(&qp->wait); } -static void want_buffer(struct ipath_devdata *dd) +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) { - if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) { + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { unsigned long flags; spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -519,7 +520,7 @@ static void ipath_no_bufs_available(struct ipath_qp *qp, spin_lock_irqsave(&dev->pending_lock, flags); list_add_tail(&qp->piowait, &dev->piowait); spin_unlock_irqrestore(&dev->pending_lock, flags); - want_buffer(dev->dd); + 
want_buffer(dev->dd, qp); dev->n_piowait++; } diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 1974df7a9f7..0d07682c731 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -449,16 +449,19 @@ int setup_sdma(struct ipath_devdata *dd) ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, dd->ipath_sdma_head_phys); - /* Reserve all the former "kernel" piobufs */ - n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - dd->ipath_pioreserved; - for (i = dd->ipath_lastport_piobuf; i < n; ++i) { + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { unsigned word = i / 64; unsigned bit = i & 63; BUG_ON(word >= 3); senddmabufmask[word] |= 1ULL << bit; } - ipath_chg_pioavailkernel(dd, dd->ipath_lastport_piobuf, - n - dd->ipath_lastport_piobuf, 0); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, senddmabufmask[0]); ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, -- cgit v1.2.3-18-g5258 From ab69b3cf1219e0d07bb4ea373f36b1de38af531c Mon Sep 17 00:00:00 2001 From: John Gregor Date: Wed, 7 May 2008 11:01:10 -0700 Subject: IB/ipath: Fix SDMA error recovery in absence of link status change What's fixed: in ipath_cancel_sends() We need to unconditionally set ABORTING. So, swap the tests so the set_bit() isn't shadowed by the &&. If we've disarmed the piobufs, then we need to unconditionally set DISARMED. So, move it out from the overly protective if at the bottom. in sdma_abort_task() Abort_task was written knowing that the SDMA engine would always be reset (and restarted) on error. A recent change broke that fundamental assumption by taking the restart portion and making it conditional on a link status change. 
But, SDMA can go boom without a link status change in some conditions. Signed-off-by: John Gregor Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_driver.c | 8 +++++--- drivers/infiniband/hw/ipath/ipath_sdma.c | 31 +++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 2036d38fac4..ce7b7c34360 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -1898,8 +1898,8 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) spin_lock_irqsave(&dd->ipath_sdma_lock, flags); skip_cancel = - !test_bit(IPATH_SDMA_DISABLED, statp) && - test_and_set_bit(IPATH_SDMA_ABORTING, statp); + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); if (skip_cancel) goto bail; @@ -1930,6 +1930,9 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) ipath_disarm_piobufs(dd, 0, dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + if (restore_sendctrl) { /* else done by caller later if needed */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); @@ -1949,7 +1952,6 @@ void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) /* only wait so long for intr */ dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; dd->ipath_sdma_reset_wait = 200; - __set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) tasklet_hi_schedule(&dd->ipath_sdma_abort_task); spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c index 0d07682c731..3697449c1ba 100644 --- a/drivers/infiniband/hw/ipath/ipath_sdma.c +++ 
b/drivers/infiniband/hw/ipath/ipath_sdma.c @@ -308,13 +308,15 @@ static void sdma_abort_task(unsigned long opaque) spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); /* - * Don't restart sdma here. Wait until link is up to ACTIVE. - * VL15 MADs used to bring the link up use PIO, and multiple - * link transitions otherwise cause the sdma engine to be + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be * stopped and started multiple times. - * The disable is done here, including the shadow, so the - * state is kept consistent. - * See ipath_restart_sdma() for the actual starting of sdma. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. */ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; @@ -326,6 +328,13 @@ static void sdma_abort_task(unsigned long opaque) /* make sure I see next message */ dd->ipath_sdma_abort_jiffies = 0; + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + goto done; } @@ -427,7 +436,12 @@ int setup_sdma(struct ipath_devdata *dd) goto done; } - dd->ipath_sdma_status = 0; + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. 
+ */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; dd->ipath_sdma_abort_jiffies = 0; dd->ipath_sdma_generation = 0; dd->ipath_sdma_descq_tail = 0; @@ -618,6 +632,9 @@ void ipath_restart_sdma(struct ipath_devdata *dd) ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + bail: return; } -- cgit v1.2.3-18-g5258 From 12137c593d127c6c1a3eb050674da047682badaf Mon Sep 17 00:00:00 2001 From: Stefan Roscher Date: Wed, 7 May 2008 11:35:06 -0700 Subject: IB/ehca: Wait for async events to finish before destroying QP This is necessary because, in a multicore environmen