X-Git-Url: http://git.onelab.eu/?a=blobdiff_plain;f=drivers%2Finfiniband%2Fhw%2Fmthca%2Fmthca_cq.c;h=1159c8a0f2c5211d3b9b6f9f2a2b59ea8de6872b;hb=97bf2856c6014879bd04983a3e9dfcdac1e7fe85;hp=38f6d14e916fb4ae7f8f1c9027a588f9967d8898;hpb=87fc8d1bb10cd459024a742c6a10961fefcef18f;p=linux-2.6.git

diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
index 38f6d14e9..1159c8a0f 100644
--- a/drivers/infiniband/hw/mthca/mthca_cq.c
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -1,5 +1,9 @@
 /*
  * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses. You may choose to be licensed under the terms of the GNU
@@ -32,12 +36,15 @@
  * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
  */
 
-#include <linux/init.h>
+#include <linux/hardirq.h>
 
-#include <ib_verbs.h>
+#include <asm/io.h>
+
+#include <rdma/ib_verbs.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
+#include "mthca_memfree.h"
 
 enum {
 	MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
@@ -47,23 +54,29 @@ enum {
 	MTHCA_CQ_ENTRY_SIZE = 0x20
 };
 
+enum {
+	MTHCA_ATOMIC_BYTE_LEN = 8
+};
+
 /*
  * Must be packed because start is 64 bits but only aligned to 32 bits.
  */
 struct mthca_cq_context {
-	u32 flags;
-	u64 start;
-	u32 logsize_usrpage;
-	u32 error_eqn;
-	u32 comp_eqn;
-	u32 pd;
-	u32 lkey;
-	u32 last_notified_index;
-	u32 solicit_producer_index;
-	u32 consumer_index;
-	u32 producer_index;
-	u32 cqn;
-	u32 reserved[3];
+	__be32 flags;
+	__be64 start;
+	__be32 logsize_usrpage;
+	__be32 error_eqn;	/* Tavor only */
+	__be32 comp_eqn;
+	__be32 pd;
+	__be32 lkey;
+	__be32 last_notified_index;
+	__be32 solicit_producer_index;
+	__be32 consumer_index;
+	__be32 producer_index;
+	__be32 cqn;
+	__be32 ci_db;		/* Arbel only */
+	__be32 state_db;	/* Arbel only */
+	u32    reserved;
 } __attribute__((packed));
 
 #define MTHCA_CQ_STATUS_OK          ( 0 << 28)
@@ -102,116 +115,176 @@ enum {
 };
 
 struct mthca_cqe {
-	u32 my_qpn;
-	u32 my_ee;
-	u32 rqpn;
-	u16 sl_g_mlpath;
-	u16 rlid;
-	u32 imm_etype_pkey_eec;
-	u32 byte_cnt;
-	u32 wqe;
-	u8  opcode;
-	u8  is_send;
-	u8  reserved;
-	u8  owner;
+	__be32 my_qpn;
+	__be32 my_ee;
+	__be32 rqpn;
+	__be16 sl_g_mlpath;
+	__be16 rlid;
+	__be32 imm_etype_pkey_eec;
+	__be32 byte_cnt;
+	__be32 wqe;
+	u8     opcode;
+	u8     is_send;
+	u8     reserved;
+	u8     owner;
 };
 
 struct mthca_err_cqe {
-	u32 my_qpn;
-	u32 reserved1[3];
-	u8  syndrome;
-	u8  reserved2;
-	u16 db_cnt;
-	u32 reserved3;
-	u32 wqe;
-	u8  opcode;
-	u8  reserved4[2];
-	u8  owner;
+	__be32 my_qpn;
+	u32    reserved1[3];
+	u8     syndrome;
+	u8     vendor_err;
+	__be16 db_cnt;
+	u32    reserved2;
+	__be32 wqe;
+	u8     opcode;
+	u8     reserved3[2];
+	u8     owner;
 };
 
 #define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
 #define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)
 
-#define MTHCA_CQ_DB_INC_CI       (1 << 24)
-#define MTHCA_CQ_DB_REQ_NOT      (2 << 24)
-#define MTHCA_CQ_DB_REQ_NOT_SOL  (3 << 24)
-#define MTHCA_CQ_DB_SET_CI       (4 << 24)
-#define MTHCA_CQ_DB_REQ_NOT_MULT (5 << 24)
+#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
 
-static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
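/*
 * [Editorial sketch -- not part of the patch.]  Both doorbell families
 * above share one layout: the command sits in bits 31:24 of the first
 * word, or'd with the CQN, and the second word carries a per-command
 * parameter.  A hypothetical helper showing what update_cons_index()
 * and mthca_tavor_arm_cq() below each do by hand:
 */
static inline void compose_cq_doorbell(__be32 db[2], u32 cmd, u32 cqn, u32 param)
{
	db[0] = cpu_to_be32(cmd | cqn);	/* e.g. MTHCA_TAVOR_CQ_DB_INC_CI */
	db[1] = cpu_to_be32(param);	/* e.g. number of CQEs consumed - 1 */
}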
+static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
+						 int entry)
 {
-	if (cq->is_direct)
-		return cq->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+	if (buf->is_direct)
+		return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
 	else
-		return cq->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+		return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
 			+ (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
 }
 
-static inline int cqe_sw(struct mthca_cq *cq, int i)
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
 {
-	return !(MTHCA_CQ_ENTRY_OWNER_HW &
-		 get_cqe(cq, i)->owner);
+	return get_cqe_from_buf(&cq->buf, entry);
 }
 
-static inline int next_cqe_sw(struct mthca_cq *cq)
+static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
 {
-	return cqe_sw(cq, cq->cons_index);
+	return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
 }
 
-static inline void set_cqe_hw(struct mthca_cq *cq, int entry)
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
 {
-	get_cqe(cq, entry)->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+	return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe));
 }
 
-static inline void inc_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
-				  int nent)
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
 {
-	u32 doorbell[2];
+	cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
 
-	doorbell[0] = cpu_to_be32(MTHCA_CQ_DB_INC_CI | cq->cqn);
-	doorbell[1] = cpu_to_be32(nent - 1);
+static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
+{
+	__be32 *cqe = cqe_ptr;
 
-	mthca_write64(doorbell,
-		      dev->kar + MTHCA_CQ_DOORBELL,
-		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+	(void) cqe;	/* avoid warning if mthca_dbg compiled away... */
+	mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+		  be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]),
+		  be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]),
+		  be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7]));
+}
+
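/*
 * [Editorial sketch -- not part of the patch.]  The helpers above form
 * the CQ's ownership protocol: hardware passes a CQE to software by
 * clearing MTHCA_CQ_ENTRY_OWNER_HW in the owner byte, and software
 * hands the slot back with set_cqe_hw() once it has been polled.  A
 * minimal consumer loop built only from these helpers (error handling
 * and CQE decoding omitted; update_cons_index() is defined just below):
 */
static int drain_cq_sketch(struct mthca_dev *dev, struct mthca_cq *cq)
{
	struct mthca_cqe *cqe;
	int freed = 0;

	spin_lock_irq(&cq->lock);

	while ((cqe = next_cqe_sw(cq)) != NULL) {
		rmb();			/* read contents only after the ownership check */

		/* ... decode the CQE here ... */

		set_cqe_hw(cqe);	/* return the slot to hardware */
		++cq->cons_index;
		++freed;
	}

	if (freed) {
		wmb();			/* ownership writes must be visible before the CI update */
		update_cons_index(dev, cq, freed);
	}

	spin_unlock_irq(&cq->lock);

	return freed;
}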
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+				     int incr)
+{
+	__be32 doorbell[2];
+
+	if (mthca_is_memfree(dev)) {
+		*cq->set_ci_db = cpu_to_be32(cq->cons_index);
+		wmb();
+	} else {
+		doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
+		doorbell[1] = cpu_to_be32(incr - 1);
+
+		mthca_write64(doorbell,
+			      dev->kar + MTHCA_CQ_DOORBELL,
+			      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		/*
+		 * Make sure doorbells don't leak out of CQ spinlock
+		 * and reach the HCA out of order:
+		 */
+		mmiowb();
+	}
 }
 
-void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
 {
 	struct mthca_cq *cq;
 
-	spin_lock(&dev->cq_table.lock);
 	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
-	if (cq)
-		atomic_inc(&cq->refcount);
-	spin_unlock(&dev->cq_table.lock);
 
 	if (!cq) {
 		mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
 		return;
 	}
 
+	++cq->arm_sn;
+
 	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
-
-	if (atomic_dec_and_test(&cq->refcount))
-		wake_up(&cq->wait);
 }
 
-void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+		    enum ib_event_type event_type)
 {
 	struct mthca_cq *cq;
-	struct mthca_cqe *cqe;
-	int prod_index;
-	int nfreed = 0;
+	struct ib_event event;
+
+	spin_lock(&dev->cq_table.lock);
 
-	spin_lock_irq(&dev->cq_table.lock);
 	cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
 	if (cq)
-		atomic_inc(&cq->refcount);
-	spin_unlock_irq(&dev->cq_table.lock);
+		++cq->refcount;
+
+	spin_unlock(&dev->cq_table.lock);
 
-	if (!cq)
+	if (!cq) {
+		mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
 		return;
+	}
+
+	event.device     = &dev->ib_dev;
+	event.event      = event_type;
+	event.element.cq = &cq->ibcq;
+	if (cq->ibcq.event_handler)
+		cq->ibcq.event_handler(&event, cq->ibcq.cq_context);
+
+	spin_lock(&dev->cq_table.lock);
+	if (!--cq->refcount)
+		wake_up(&cq->wait);
+	spin_unlock(&dev->cq_table.lock);
+}
+
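/*
 * [Editorial sketch -- not part of the patch.]  The two functions that
 * follow exist so a QP can be torn out of a CQ it shares with other
 * QPs.  The caller sits in mthca_qp.c and, in this kernel generation,
 * looks roughly like this (paraphrased, not verbatim):
 */
static void clean_qp_cqs_sketch(struct mthca_dev *dev, struct mthca_qp *qp)
{
	if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
		mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL);
	mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn,
		       qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
}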
+static inline int is_recv_cqe(struct mthca_cqe *cqe)
+{
+	if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+	    MTHCA_ERROR_CQE_OPCODE_MASK)
+		return !(cqe->opcode & 0x01);
+	else
+		return !(cqe->is_send & 0x80);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+		    struct mthca_srq *srq)
+{
+	struct mthca_cqe *cqe;
+	u32 prod_index;
+	int nfreed = 0;
 
 	spin_lock_irq(&cq->lock);
 
@@ -223,66 +296,105 @@ void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
 	 * from our QP and therefore don't need to be checked.
 	 */
 	for (prod_index = cq->cons_index;
-	     cqe_sw(cq, prod_index & cq->ibcq.cqe);
+	     cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
 	     ++prod_index)
 		if (prod_index == cq->cons_index + cq->ibcq.cqe)
 			break;
 
 	if (0)
 		mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
-			  qpn, cqn, cq->cons_index, prod_index);
+			  qpn, cq->cqn, cq->cons_index, prod_index);
 
 	/*
 	 * Now sweep backwards through the CQ, removing CQ entries
 	 * that match our QP by copying older entries on top of them.
 	 */
-	while (prod_index > cq->cons_index) {
-		cqe = get_cqe(cq, (prod_index - 1) & cq->ibcq.cqe);
-		if (cqe->my_qpn == cpu_to_be32(qpn))
+	while ((int) --prod_index - (int) cq->cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		if (cqe->my_qpn == cpu_to_be32(qpn)) {
+			if (srq && is_recv_cqe(cqe))
+				mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
 			++nfreed;
-		else if (nfreed)
-			memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
-				       cq->ibcq.cqe),
-			       cqe,
-			       MTHCA_CQ_ENTRY_SIZE);
-		--prod_index;
+		} else if (nfreed)
+			memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+			       cqe, MTHCA_CQ_ENTRY_SIZE);
 	}
 
 	if (nfreed) {
 		wmb();
-		inc_cons_index(dev, cq, nfreed);
-		cq->cons_index = (cq->cons_index + nfreed) & cq->ibcq.cqe;
+		cq->cons_index += nfreed;
+		update_cons_index(dev, cq, nfreed);
 	}
 
 	spin_unlock_irq(&cq->lock);
-	if (atomic_dec_and_test(&cq->refcount))
-		wake_up(&cq->wait);
 }
 
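/*
 * [Editorial worked example for mthca_cq_clean() above.]  With a
 * 4-entry window, cons_index == 0, prod_index == 4, and the dying QP
 * being A:
 *
 *	slot:     0     1     2     3
 *	before:   A     B     A     C
 *	after:    --    --    B     C      (nfreed == 2)
 *
 * Sweeping backwards, each surviving CQE is copied nfreed slots toward
 * the producer end, so the freed slots collect at the consumer end and
 * cons_index += nfreed simply skips over them; B and C keep their
 * relative order.
 */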
-static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
-			    struct mthca_qp *qp, int wqe_index, int is_send,
-			    struct mthca_err_cqe *cqe,
-			    struct ib_wc *entry, int *free_cqe)
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
 {
-	int err;
-	int dbd;
-	u32 new_wqe;
+	int i;
 
-	if (1 && cqe->syndrome != SYNDROME_WR_FLUSH_ERR) {
-		int j;
+	/*
+	 * In Tavor mode, the hardware keeps the consumer and producer
+	 * indices mod the CQ size.  Since we might be making the CQ
+	 * bigger, we need to deal with the case where the producer
+	 * index wrapped around before the CQ was resized.
	 */
+	if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
+	    cq->ibcq.cqe < cq->resize_buf->cqe) {
+		cq->cons_index &= cq->ibcq.cqe;
+		if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
+			cq->cons_index -= cq->ibcq.cqe + 1;
+	}
 
-		mthca_dbg(dev, "%x/%d: error CQE -> QPN %06x, WQE @ %08x\n",
-			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
-			  be32_to_cpu(cqe->wqe));
+	for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
+		memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
+					i & cq->resize_buf->cqe),
+		       get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
+}
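/*
 * [Editorial worked example for mthca_cq_resize_copy_cqes() above.]
 * Say the old buffer holds 8 CQEs (ibcq.cqe == 7), the new one 16
 * (resize_buf->cqe == 15), cons_index == 6, and three CQEs are still
 * unpolled.  The copy loop reads old slots 6 & 7 == 6, 7 & 7 == 7 and
 * 8 & 7 == 0, and writes new slots 6, 7 and 8: every CQE lands at its
 * position modulo the new ring size, so the consumer index stays valid
 * after mthca_poll_cq() swaps the buffers in.
 */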
-		for (j = 0; j < 8; ++j)
-			printk(KERN_DEBUG "  [%2x] %08x\n",
-			       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
+{
+	int ret;
+	int i;
+
+	ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
+			      MTHCA_MAX_DIRECT_CQ_SIZE,
+			      &buf->queue, &buf->is_direct,
+			      &dev->driver_pd, 1, &buf->mr);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nent; ++i)
+		set_cqe_hw(get_cqe_from_buf(buf, i));
+
+	return 0;
+}
+
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
+{
+	mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue,
+		       buf->is_direct, &buf->mr);
+}
+
+static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+			     struct mthca_qp *qp, int wqe_index, int is_send,
+			     struct mthca_err_cqe *cqe,
+			     struct ib_wc *entry, int *free_cqe)
+{
+	int dbd;
+	__be32 new_wqe;
+
+	if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
+		mthca_dbg(dev, "local QP operation err "
+			  "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
+			  be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
+			  cq->cqn, cq->cons_index);
+		dump_cqe(dev, cqe);
+	}
 
 	/*
-	 * For completions in error, only work request ID, status (and
-	 * freed resource count for RD) have to be set.
+	 * For completions in error, only work request ID, status, vendor error
+	 * (and freed resource count for RD) have to be set.
 	 */
 	switch (cqe->syndrome) {
 	case SYNDROME_LOCAL_LENGTH_ERR:
@@ -344,9 +456,16 @@ static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
 		break;
 	}
 
-	err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);
-	if (err)
-		return err;
+	entry->vendor_err = cqe->vendor_err;
+
+	/*
+	 * Mem-free HCAs always generate one CQE per WQE, even in the
+	 * error case, so we don't have to check the doorbell count, etc.
+	 */
+	if (mthca_is_memfree(dev))
+		return;
+
+	mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
 
 	/*
 	 * If we're at the end of the WQE chain, or we've used up our
@@ -354,24 +473,13 @@ static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
 	 * the next poll operation.
 	 */
 	if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
-		return 0;
+		return;
 
 	cqe->db_cnt   = cpu_to_be16(be16_to_cpu(cqe->db_cnt) - dbd);
 	cqe->wqe      = new_wqe;
 	cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
 
 	*free_cqe = 0;
-
-	return 0;
-}
-
-static void dump_cqe(struct mthca_cqe *cqe)
-{
-	int j;
-
-	for (j = 0; j < 8; ++j)
-		printk(KERN_DEBUG "  [%2x] %08x\n",
-		       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
 }
 
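/*
 * [Editorial sketch -- not part of the patch.]  A consumer sees the
 * outcome of handle_error_cqe() through the work-completion fields set
 * above; vendor_err is the detail this patch starts propagating:
 */
static void report_failed_wc_sketch(struct ib_wc *wc)
{
	if (wc->status != IB_WC_SUCCESS)
		printk(KERN_ERR "QP 0x%x: wr_id 0x%llx failed: status %d, vendor_err 0x%x\n",
		       wc->qp_num, (unsigned long long) wc->wr_id,
		       wc->status, wc->vendor_err);
}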
 static inline int mthca_poll_one(struct mthca_dev *dev,
@@ -383,12 +491,13 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
 	struct mthca_wq *wq;
 	struct mthca_cqe *cqe;
 	int wqe_index;
-	int is_error = 0;
+	int is_error;
 	int is_send;
 	int free_cqe = 1;
 	int err = 0;
 
-	if (!next_cqe_sw(cq))
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
 		return -EAGAIN;
 
 	/*
@@ -397,49 +506,32 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
 	 */
 	rmb();
 
-	cqe = get_cqe(cq, cq->cons_index);
-
 	if (0) {
 		mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
 			  cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
 			  be32_to_cpu(cqe->wqe));
-
-		dump_cqe(cqe);
+		dump_cqe(dev, cqe);
 	}
 
-	if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
-	    MTHCA_ERROR_CQE_OPCODE_MASK) {
-		is_error = 1;
-		is_send = cqe->opcode & 1;
-	} else
-		is_send = cqe->is_send & 0x80;
+	is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+		MTHCA_ERROR_CQE_OPCODE_MASK;
+	is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
 
 	if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
-		if (*cur_qp) {
-			if (*freed) {
-				wmb();
-				inc_cons_index(dev, cq, *freed);
-				*freed = 0;
-			}
-			spin_unlock(&(*cur_qp)->lock);
-		}
-
-		spin_lock(&dev->qp_table.lock);
+		/*
+		 * We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
 		*cur_qp = mthca_array_get(&dev->qp_table.qp,
 					  be32_to_cpu(cqe->my_qpn) &
 					  (dev->limits.num_qps - 1));
-		if (*cur_qp)
-			atomic_inc(&(*cur_qp)->refcount);
-		spin_unlock(&dev->qp_table.lock);
-
 		if (!*cur_qp) {
 			mthca_warn(dev, "CQ entry for unknown QP %06x\n",
 				   be32_to_cpu(cqe->my_qpn) & 0xffffff);
 			err = -EINVAL;
 			goto out;
 		}
-
-		spin_lock(&(*cur_qp)->lock);
 	}
 
 	entry->qp_num = (*cur_qp)->qpn;
@@ -450,33 +542,80 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
 		       >> wq->wqe_shift);
 		entry->wr_id = (*cur_qp)->wrid[wqe_index +
 					       (*cur_qp)->rq.max];
+	} else if ((*cur_qp)->ibqp.srq) {
+		struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
+		u32 wqe = be32_to_cpu(cqe->wqe);
+		wq = NULL;
+		wqe_index = wqe >> srq->wqe_shift;
+		entry->wr_id = srq->wrid[wqe_index];
+		mthca_free_srq_wqe(srq, wqe);
 	} else {
+		s32 wqe;
 		wq = &(*cur_qp)->rq;
-		wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift;
+		wqe = be32_to_cpu(cqe->wqe);
+		wqe_index = wqe >> wq->wqe_shift;
+		/*
+		 * WQE addr == base - 1 might be reported in receive completion
+		 * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
+		 * Arbel FW 5.1.400.  This bug should be fixed in later FW revs.
+		 */
+		if (unlikely(wqe_index < 0))
+			wqe_index = wq->max - 1;
 		entry->wr_id = (*cur_qp)->wrid[wqe_index];
 	}
 
-	if (wq->last_comp < wqe_index)
-		wq->cur -= wqe_index - wq->last_comp;
-	else
-		wq->cur -= wq->max - wq->last_comp + wqe_index;
-
-	wq->last_comp = wqe_index;
+	if (wq) {
+		if (wq->last_comp < wqe_index)
+			wq->tail += wqe_index - wq->last_comp;
+		else
+			wq->tail += wqe_index + wq->max - wq->last_comp;
 
-	if (0)
-		mthca_dbg(dev, "%s completion for QP %06x, index %d (nr %d)\n",
-			  is_send ? "Send" : "Receive",
-			  (*cur_qp)->qpn, wqe_index, wq->max);
+		wq->last_comp = wqe_index;
+	}
 
 	if (is_error) {
-		err = handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
-				       (struct mthca_err_cqe *) cqe,
-				       entry, &free_cqe);
+		handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+				 (struct mthca_err_cqe *) cqe,
+				 entry, &free_cqe);
 		goto out;
 	}
 
 	if (is_send) {
-		entry->opcode = IB_WC_SEND; /* XXX */
+		entry->wc_flags = 0;
+		switch (cqe->opcode) {
+		case MTHCA_OPCODE_RDMA_WRITE:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			break;
+		case MTHCA_OPCODE_RDMA_WRITE_IMM:
+			entry->opcode    = IB_WC_RDMA_WRITE;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_SEND:
+			entry->opcode    = IB_WC_SEND;
+			break;
+		case MTHCA_OPCODE_SEND_IMM:
+			entry->opcode    = IB_WC_SEND;
+			entry->wc_flags |= IB_WC_WITH_IMM;
+			break;
+		case MTHCA_OPCODE_RDMA_READ:
+			entry->opcode    = IB_WC_RDMA_READ;
+			entry->byte_len  = be32_to_cpu(cqe->byte_cnt);
+			break;
+		case MTHCA_OPCODE_ATOMIC_CS:
+			entry->opcode    = IB_WC_COMP_SWAP;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_ATOMIC_FA:
+			entry->opcode    = IB_WC_FETCH_ADD;
+			entry->byte_len  = MTHCA_ATOMIC_BYTE_LEN;
+			break;
+		case MTHCA_OPCODE_BIND_MW:
+			entry->opcode    = IB_WC_BIND_MW;
+			break;
+		default:
+			entry->opcode    = MTHCA_OPCODE_INVALID;
+			break;
+		}
 	} else {
 		entry->byte_len = be32_to_cpu(cqe->byte_cnt);
 		switch (cqe->opcode & 0x1f) {
@@ -509,10 +648,10 @@ static inline int mthca_poll_one(struct mthca_dev *dev,
 	entry->status = IB_WC_SUCCESS;
 
  out:
-	if (free_cqe) {
-		set_cqe_hw(cq, cq->cons_index);
+	if (likely(free_cqe)) {
+		set_cqe_hw(cqe);
 		++(*freed);
-		cq->cons_index = (cq->cons_index + 1) & cq->ibcq.cqe;
+		++cq->cons_index;
 	}
 
 	return err;
 }
 
@@ -531,163 +670,191 @@ int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
 
 	spin_lock_irqsave(&cq->lock, flags);
 
-	for (npolled = 0; npolled < num_entries; ++npolled) {
+	npolled = 0;
+repoll:
+	while (npolled < num_entries) {
 		err = mthca_poll_one(dev, cq, &qp,
 				     &freed, entry + npolled);
 		if (err)
 			break;
+		++npolled;
 	}
 
 	if (freed) {
 		wmb();
-		inc_cons_index(dev, cq, freed);
+		update_cons_index(dev, cq, freed);
 	}
 
-	if (qp) {
-		spin_unlock(&qp->lock);
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
+	/*
+	 * If a CQ resize is in progress and we discovered that the
+	 * old buffer is empty, then peek in the new buffer, and if
+	 * it's not empty, switch to the new buffer and continue
+	 * polling there.
+	 */
+	if (unlikely(err == -EAGAIN && cq->resize_buf &&
+		     cq->resize_buf->state == CQ_RESIZE_READY)) {
+		/*
+		 * In Tavor mode, the hardware keeps the producer
+		 * index modulo the CQ size.  Since we might be making
+		 * the CQ bigger, we need to mask our consumer index
+		 * using the size of the old CQ buffer before looking
+		 * in the new CQ buffer.
+		 */
+		if (!mthca_is_memfree(dev))
+			cq->cons_index &= cq->ibcq.cqe;
+
+		if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
+					    cq->cons_index & cq->resize_buf->cqe))) {
+			struct mthca_cq_buf tbuf;
+			int tcqe;
+
+			tbuf         = cq->buf;
+			tcqe         = cq->ibcq.cqe;
+			cq->buf      = cq->resize_buf->buf;
+			cq->ibcq.cqe = cq->resize_buf->cqe;
+
+			cq->resize_buf->buf   = tbuf;
+			cq->resize_buf->cqe   = tcqe;
+			cq->resize_buf->state = CQ_RESIZE_SWAPPED;
+
+			goto repoll;
+		}
 	}
-
 	spin_unlock_irqrestore(&cq->lock, flags);
 
 	return err == 0 || err == -EAGAIN ? npolled : err;
 }
 
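/*
 * [Editorial sketch, assuming the ib_verbs API of this kernel
 * generation.]  mthca_poll_cq() above backs the ib_poll_cq() verb, and
 * the two arm functions below back ib_req_notify_cq().  A ULP closes
 * the race between its last poll and the arm request by polling again
 * after arming:
 */
static void poll_and_rearm_sketch(struct ib_cq *cq)
{
	struct ib_wc wc;

	while (ib_poll_cq(cq, 1, &wc) > 0)
		;	/* consume wc */

	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

	while (ib_poll_cq(cq, 1, &wc) > 0)
		;	/* consume anything that slipped in before the arm */
}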
-void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
-		  int solicited)
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify notify)
 {
-	u32 doorbell[2];
+	__be32 doorbell[2];
 
-	doorbell[0] = cpu_to_be32((solicited ?
-				   MTHCA_CQ_DB_REQ_NOT_SOL :
-				   MTHCA_CQ_DB_REQ_NOT)      |
-				  cq->cqn);
-	doorbell[1] = 0xffffffff;
+	doorbell[0] = cpu_to_be32((notify == IB_CQ_SOLICITED ?
+				   MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+				   MTHCA_TAVOR_CQ_DB_REQ_NOT)      |
+				  to_mcq(cq)->cqn);
+	doorbell[1] = (__force __be32) 0xffffffff;
 
 	mthca_write64(doorbell,
-		      dev->kar + MTHCA_CQ_DOORBELL,
-		      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+		      to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+	return 0;
 }
 
-int mthca_init_cq(struct mthca_dev *dev, int nent,
-		  struct mthca_cq *cq)
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
 {
-	int size = nent * MTHCA_CQ_ENTRY_SIZE;
-	dma_addr_t t;
-	void *mailbox = NULL;
-	int npages, shift;
-	u64 *dma_list = NULL;
-	struct mthca_cq_context *cq_context;
-	int err = -ENOMEM;
-	u8 status;
-	int i;
-
-	might_sleep();
-
-	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
-			  GFP_KERNEL);
-	if (!mailbox)
-		goto err_out;
-
-	cq_context = MAILBOX_ALIGN(mailbox);
+	struct mthca_cq *cq = to_mcq(ibcq);
+	__be32 doorbell[2];
+	u32 sn;
+	__be32 ci;
 
-	if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
-		if (0)
-			mthca_dbg(dev, "Creating direct CQ of size %d\n", size);
+	sn = cq->arm_sn & 3;
+	ci = cpu_to_be32(cq->cons_index);
 
-		cq->is_direct = 1;
-		npages        = 1;
-		shift         = get_order(size) + PAGE_SHIFT;
+	doorbell[0] = ci;
+	doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+				  (notify == IB_CQ_SOLICITED ? 1 : 2));
 
-		cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
-							    size, &t);
-		if (!cq->queue.direct.buf)
-			goto err_out;
+	mthca_write_db_rec(doorbell, cq->arm_db);
 
-		pci_unmap_addr_set(&cq->queue.direct, mapping, t);
+	/*
+	 * Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
	 */
+	wmb();
 
-		memset(cq->queue.direct.buf, 0, size);
+	doorbell[0] = cpu_to_be32((sn << 28)                       |
+				  (notify == IB_CQ_SOLICITED ?
				   MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
				   MTHCA_ARBEL_CQ_DB_REQ_NOT)      |
+				  cq->cqn);
+	doorbell[1] = ci;
 
-		while (t & ((1 << shift) - 1)) {
-			--shift;
-			npages *= 2;
-		}
+	mthca_write64(doorbell,
+		      to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+		      MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
 
-		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
-		if (!dma_list)
-			goto err_out_free;
+	return 0;
+}
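/*
 * [Editorial note.]  Tavor rings a single MMIO doorbell to arm a CQ,
 * while Arbel writes the doorbell record in host memory first and then
 * the MMIO doorbell, as seen above.  The provider therefore picks the
 * verb per chip at init time -- roughly (an assumption, following
 * mthca_provider.c in the same tree):
 *
 *	if (mthca_is_memfree(dev))
 *		dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
 *	else
 *		dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
 */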
-		for (i = 0; i < npages; ++i)
-			dma_list[i] = t + i * (1 << shift);
-	} else {
-		cq->is_direct = 0;
-		npages        = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-		shift         = PAGE_SHIFT;
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+		  struct mthca_ucontext *ctx, u32 pdn,
+		  struct mthca_cq *cq)
+{
+	struct mthca_mailbox *mailbox;
+	struct mthca_cq_context *cq_context;
+	int err = -ENOMEM;
+	u8 status;
 
-		if (0)
-			mthca_dbg(dev, "Creating indirect CQ with %d pages\n", npages);
+	cq->ibcq.cqe  = nent - 1;
+	cq->is_kernel = !ctx;
 
-		dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
-		if (!dma_list)
-			goto err_out;
+	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+	if (cq->cqn == -1)
+		return -ENOMEM;
 
-		cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
-					      GFP_KERNEL);
-		if (!cq->queue.page_list)
+	if (mthca_is_memfree(dev)) {
+		err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+		if (err)
 			goto err_out;
 
-		for (i = 0; i < npages; ++i)
-			cq->queue.page_list[i].buf = NULL;
+		if (cq->is_kernel) {
+			cq->arm_sn = 1;
 
-		for (i = 0; i < npages; ++i) {
-			cq->queue.page_list[i].buf =
-				pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
-			if (!cq->queue.page_list[i].buf)
-				goto err_out_free;
+			err = -ENOMEM;
 
-			dma_list[i] = t;
-			pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);
+			cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+							     cq->cqn, &cq->set_ci_db);
+			if (cq->set_ci_db_index < 0)
+				goto err_out_icm;
 
-			memset(cq->queue.page_list[i].buf, 0, PAGE_SIZE);
+			cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+							  cq->cqn, &cq->arm_db);
+			if (cq->arm_db_index < 0)
+				goto err_out_ci;
 		}
 	}
 
-	for (i = 0; i < nent; ++i)
-		set_cqe_hw(cq, i);
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox))
+		goto err_out_arm;
 
-	cq->cqn = mthca_alloc(&dev->cq_table.alloc);
-	if (cq->cqn == -1)
-		goto err_out_free;
-
-	err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
-				  dma_list, shift, npages,
-				  0, size,
-				  MTHCA_MPT_FLAG_LOCAL_WRITE |
-				  MTHCA_MPT_FLAG_LOCAL_READ,
-				  &cq->mr);
-	if (err)
-		goto err_out_free_cq;
+	cq_context = mailbox->buf;
+
+	if (cq->is_kernel) {
+		err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
+		if (err)
+			goto err_out_mailbox;
+	}
 
 	spin_lock_init(&cq->lock);
-	atomic_set(&cq->refcount, 1);
+	cq->refcount = 1;
 	init_waitqueue_head(&cq->wait);
+	mutex_init(&cq->mutex);
 
 	memset(cq_context, 0, sizeof *cq_context);
 	cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
 						  MTHCA_CQ_STATE_DISARMED |
 						  MTHCA_CQ_FLAG_TR);
-	cq_context->start           = cpu_to_be64(0);
-	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
-						  MTHCA_KAR_PAGE);
+	cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+	if (ctx)
+		cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+	else
+		cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
 	cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
 	cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
-	cq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
-	cq_context->lkey            = cpu_to_be32(cq->mr.ibmr.lkey);
+	cq_context->pd              = cpu_to_be32(pdn);
+	cq_context->lkey            = cpu_to_be32(cq->buf.mr.ibmr.lkey);
 	cq_context->cqn             = cpu_to_be32(cq->cqn);
 
+	if (mthca_is_memfree(dev)) {
+		cq_context->ci_db    = cpu_to_be32(cq->set_ci_db_index);
+		cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+	}
+
-	err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
+	err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn, &status);
 	if (err) {
 		mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
 		goto err_out_free_mr;
 	}
@@ -711,69 +878,71 @@ int mthca_init_cq(struct mthca_dev *dev, int nent,
 
 	cq->cons_index = 0;
 
-	kfree(dma_list);
-	kfree(mailbox);
+	mthca_free_mailbox(dev, mailbox);
 
 	return 0;
 
-	err_out_free_mr:
-	mthca_free_mr(dev, &cq->mr);
+err_out_free_mr:
+	if (cq->is_kernel)
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
 
-	err_out_free_cq:
-	mthca_free(&dev->cq_table.alloc, cq->cqn);
+err_out_mailbox:
+	mthca_free_mailbox(dev, mailbox);
 
-	err_out_free:
-	if (cq->is_direct)
-		pci_free_consistent(dev->pdev, size,
-				    cq->queue.direct.buf,
-				    pci_unmap_addr(&cq->queue.direct, mapping));
-	else {
-		for (i = 0; i < npages; ++i)
-			if (cq->queue.page_list[i].buf)
-				pci_free_consistent(dev->pdev, PAGE_SIZE,
-						    cq->queue.page_list[i].buf,
-						    pci_unmap_addr(&cq->queue.page_list[i],
								   mapping));
-
-		kfree(cq->queue.page_list);
-	}
+err_out_arm:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+	if (cq->is_kernel && mthca_is_memfree(dev))
+		mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
 
-	err_out:
-	kfree(dma_list);
-	kfree(mailbox);
+err_out_icm:
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+	mthca_free(&dev->cq_table.alloc, cq->cqn);
 
 	return err;
 }
 
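/*
 * [Editorial sketch -- not part of the patch.]  mthca_init_cq() above
 * stores only log2 of the CQ size ("ffs(nent) - 1"), so nent must be a
 * power of two.  A kernel-mode caller, paraphrasing what the
 * ib_create_cq path in mthca_provider.c does (the exact rounding there
 * is an assumption):
 */
static int create_kernel_cq_sketch(struct mthca_dev *dev, int entries,
				   struct mthca_cq *cq)
{
	/* room for "entries" CQEs, rounded up to a power of two */
	int nent = roundup_pow_of_two(entries + 1);

	return mthca_init_cq(dev, nent, NULL, dev->driver_pd.pd_num, cq);
}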
+static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+	int c;
+
+	spin_lock_irq(&dev->cq_table.lock);
+	c = cq->refcount;
+	spin_unlock_irq(&dev->cq_table.lock);
+
+	return c;
+}
+
 void mthca_free_cq(struct mthca_dev *dev,
 		   struct mthca_cq *cq)
 {
-	void *mailbox;
+	struct mthca_mailbox *mailbox;
 	int err;
 	u8 status;
 
-	might_sleep();
-
-	mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
-			  GFP_KERNEL);
-	if (!mailbox) {
+	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+	if (IS_ERR(mailbox)) {
 		mthca_warn(dev, "No memory for mailbox to free CQ.\n");
 		return;
 	}
 
-	err = mthca_HW2SW_CQ(dev, MAILBOX_ALIGN(mailbox), cq->cqn, &status);
+	err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn, &status);
 	if (err)
 		mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
 	else if (status)
-		mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n",
-			   status);
+		mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n", status);
 
 	if (0) {
-		u32 *ctx = MAILBOX_ALIGN(mailbox);
+		__be32 *ctx = mailbox->buf;
 		int j;
 
 		printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
-		       cq->cqn, cq->cons_index, next_cqe_sw(cq));
+		       cq->cqn, cq->cons_index,
+		       cq->is_kernel ? !!next_cqe_sw(cq) : 0);
 		for (j = 0; j < 16; ++j)
 			printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
 	}
@@ -781,39 +950,30 @@ void mthca_free_cq(struct mthca_dev *dev,
 	spin_lock_irq(&dev->cq_table.lock);
 	mthca_array_clear(&dev->cq_table.cq,
 			  cq->cqn & (dev->limits.num_cqs - 1));
+	--cq->refcount;
 	spin_unlock_irq(&dev->cq_table.lock);
 
-	atomic_dec(&cq->refcount);
-	wait_event(cq->wait, !atomic_read(&cq->refcount));
-
-	mthca_free_mr(dev, &cq->mr);
-
-	if (cq->is_direct)
-		pci_free_consistent(dev->pdev,
-				    (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
-				    cq->queue.direct.buf,
-				    pci_unmap_addr(&cq->queue.direct,
						   mapping));
-	else {
-		int i;
-
-		for (i = 0;
-		     i < ((cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE + PAGE_SIZE - 1) /
			     PAGE_SIZE;
-		     ++i)
-			pci_free_consistent(dev->pdev, PAGE_SIZE,
-					    cq->queue.page_list[i].buf,
-					    pci_unmap_addr(&cq->queue.page_list[i],
							   mapping));
-
-		kfree(cq->queue.page_list);
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+		synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+	else
+		synchronize_irq(dev->pdev->irq);
+
+	wait_event(cq->wait, !get_cq_refcount(dev, cq));
+
+	if (cq->is_kernel) {
+		mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+		if (mthca_is_memfree(dev)) {
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM,    cq->arm_db_index);
+			mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+		}
 	}
 
+	mthca_table_put(dev, dev->cq_table.table, cq->cqn);
 	mthca_free(&dev->cq_table.alloc, cq->cqn);
-	kfree(mailbox);
+	mthca_free_mailbox(dev, mailbox);
 }
 
-int __devinit mthca_init_cq_table(struct mthca_dev *dev)
+int mthca_init_cq_table(struct mthca_dev *dev)
 {
 	int err;
 
@@ -834,7 +994,7 @@ int __devinit mthca_init_cq_table(struct mthca_dev *dev)
 	return err;
 }
 
-void __devexit mthca_cleanup_cq_table(struct mthca_dev *dev)
+void mthca_cleanup_cq_table(struct mthca_dev *dev)
 {
 	mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
 	mthca_alloc_cleanup(&dev->cq_table.alloc);