/*
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: mthca_cq.c 1369 2004-12-20 16:17:07Z roland $
 */

#include <linux/init.h>

#include <ib_pack.h>

#include "mthca_dev.h"
#include "mthca_cmd.h"

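/*
 * A CQ whose entries fit in MTHCA_MAX_DIRECT_CQ_SIZE bytes is backed by
 * one contiguous DMA buffer ("direct"); anything larger uses a list of
 * separately allocated pages ("indirect").  mthca_init_cq() chooses the
 * layout and get_cqe() hides the difference from the rest of this file.
 */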
enum {
        MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
};

enum {
        MTHCA_CQ_ENTRY_SIZE = 0x20
};

/*
 * Must be packed because start is 64 bits but only aligned to 32 bits.
 */
struct mthca_cq_context {
        u32 flags;
        u64 start;
        u32 logsize_usrpage;
        u32 error_eqn;
        u32 comp_eqn;
        u32 pd;
        u32 lkey;
        u32 last_notified_index;
        u32 solicit_producer_index;
        u32 consumer_index;
        u32 producer_index;
        u32 cqn;
        u32 reserved[3];
} __attribute__((packed));

#define MTHCA_CQ_STATUS_OK          ( 0 << 28)
#define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
#define MTHCA_CQ_STATUS_WRITE_FAIL  (10 << 28)
#define MTHCA_CQ_FLAG_TR            ( 1 << 18)
#define MTHCA_CQ_FLAG_OI            ( 1 << 17)
#define MTHCA_CQ_STATE_DISARMED     ( 0 <<  8)
#define MTHCA_CQ_STATE_ARMED        ( 1 <<  8)
#define MTHCA_CQ_STATE_ARMED_SOL    ( 4 <<  8)
#define MTHCA_EQ_STATE_FIRED        (10 <<  8)

enum {
        MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
};

enum {
        SYNDROME_LOCAL_LENGTH_ERR        = 0x01,
        SYNDROME_LOCAL_QP_OP_ERR         = 0x02,
        SYNDROME_LOCAL_EEC_OP_ERR        = 0x03,
        SYNDROME_LOCAL_PROT_ERR          = 0x04,
        SYNDROME_WR_FLUSH_ERR            = 0x05,
        SYNDROME_MW_BIND_ERR             = 0x06,
        SYNDROME_BAD_RESP_ERR            = 0x10,
        SYNDROME_LOCAL_ACCESS_ERR        = 0x11,
        SYNDROME_REMOTE_INVAL_REQ_ERR    = 0x12,
        SYNDROME_REMOTE_ACCESS_ERR       = 0x13,
        SYNDROME_REMOTE_OP_ERR           = 0x14,
        SYNDROME_RETRY_EXC_ERR           = 0x15,
        SYNDROME_RNR_RETRY_EXC_ERR       = 0x16,
        SYNDROME_LOCAL_RDD_VIOL_ERR      = 0x20,
        SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
        SYNDROME_REMOTE_ABORTED_ERR      = 0x22,
        SYNDROME_INVAL_EECN_ERR          = 0x23,
        SYNDROME_INVAL_EEC_STATE_ERR     = 0x24
};

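/*
 * CQ entry as written by the HCA.  All multi-byte fields are big-endian
 * in memory, which is why every access below goes through
 * be32_to_cpu()/be16_to_cpu() (or cpu_to_be32() when comparing against
 * them).
 */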
struct mthca_cqe {
        u32 my_qpn;
        u32 my_ee;
        u32 rqpn;
        u16 sl_g_mlpath;
        u16 rlid;
        u32 imm_etype_pkey_eec;
        u32 byte_cnt;
        u32 wqe;
        u8  opcode;
        u8  is_send;
        u8  reserved;
        u8  owner;
};

struct mthca_err_cqe {
        u32 my_qpn;
        u32 reserved1[3];
        u8  syndrome;
        u8  reserved2;
        u16 db_cnt;
        u32 reserved3;
        u32 wqe;
        u8  opcode;
        u8  reserved4[2];
        u8  owner;
};

#define MTHCA_CQ_ENTRY_OWNER_SW      (0 << 7)
#define MTHCA_CQ_ENTRY_OWNER_HW      (1 << 7)

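/*
 * CQ doorbell commands.  A doorbell is two 32-bit words written with
 * mthca_write64(): word 0 carries one of these commands in its top byte
 * together with the CQN, and word 1 carries the command argument (for
 * INC_CI, the number of entries consumed minus one); see
 * inc_cons_index() and mthca_arm_cq() below.
 */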
#define MTHCA_CQ_DB_INC_CI       (1 << 24)
#define MTHCA_CQ_DB_REQ_NOT      (2 << 24)
#define MTHCA_CQ_DB_REQ_NOT_SOL  (3 << 24)
#define MTHCA_CQ_DB_SET_CI       (4 << 24)
#define MTHCA_CQ_DB_REQ_NOT_MULT (5 << 24)

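/*
 * CQE ownership: the HCA writes a completion with the owner bit clear
 * (OWNER_SW) to hand the entry to software; after consuming it the
 * driver sets the bit again with set_cqe_hw() and reports the freed
 * entries to the HCA via the INC_CI doorbell.
 */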
static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
{
        if (cq->is_direct)
                return cq->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
        else
                return cq->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
                        + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
}

static inline int cqe_sw(struct mthca_cq *cq, int i)
{
        return !(MTHCA_CQ_ENTRY_OWNER_HW &
                 get_cqe(cq, i)->owner);
}

static inline int next_cqe_sw(struct mthca_cq *cq)
{
        return cqe_sw(cq, cq->cons_index);
}

static inline void set_cqe_hw(struct mthca_cq *cq, int entry)
{
        get_cqe(cq, entry)->owner = MTHCA_CQ_ENTRY_OWNER_HW;
}

static inline void inc_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
                                  int nent)
{
        u32 doorbell[2];

        doorbell[0] = cpu_to_be32(MTHCA_CQ_DB_INC_CI | cq->cqn);
        doorbell[1] = cpu_to_be32(nent - 1);

        mthca_write64(doorbell,
                      dev->kar + MTHCA_CQ_DOORBELL,
                      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

void mthca_cq_event(struct mthca_dev *dev, u32 cqn)
{
        struct mthca_cq *cq;

        spin_lock(&dev->cq_table.lock);
        cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
        if (cq)
                atomic_inc(&cq->refcount);
        spin_unlock(&dev->cq_table.lock);

        if (!cq) {
                mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
                return;
        }

        cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);

        if (atomic_dec_and_test(&cq->refcount))
                wake_up(&cq->wait);
}

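/*
 * Remove all CQEs belonging to a given QP from a CQ.  Called while the
 * QP is already in RESET (see the comment below), typically as part of
 * tearing the QP down, so that later polls never return completions
 * for it.
 */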
void mthca_cq_clean(struct mthca_dev *dev, u32 cqn, u32 qpn)
{
        struct mthca_cq *cq;
        struct mthca_cqe *cqe;
        int prod_index;
        int nfreed = 0;

        spin_lock_irq(&dev->cq_table.lock);
        cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
        if (cq)
                atomic_inc(&cq->refcount);
        spin_unlock_irq(&dev->cq_table.lock);

        if (!cq)
                return;

        spin_lock_irq(&cq->lock);

        /*
         * First we need to find the current producer index, so we
         * know where to start cleaning from.  It doesn't matter if HW
         * adds new entries after this loop -- the QP we're worried
         * about is already in RESET, so the new entries won't come
         * from our QP and therefore don't need to be checked.
         */
        for (prod_index = cq->cons_index;
             cqe_sw(cq, prod_index & cq->ibcq.cqe);
             ++prod_index)
                if (prod_index == cq->cons_index + cq->ibcq.cqe)
                        break;

        if (0)
                mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
                          qpn, cqn, cq->cons_index, prod_index);

        /*
         * Now sweep backwards through the CQ, removing CQ entries
         * that match our QP by copying older entries on top of them.
         */
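        /*
         * Illustration (indices shown unwrapped, ignoring the
         * "& cq->ibcq.cqe" masking): with cons_index 3, prod_index 7,
         * and entries 4 and 5 belonging to the QP being cleaned, entry 3
         * is copied on top of entry 5; slots 3 and 4 are then handed
         * back to the HCA by inc_cons_index(dev, cq, 2) and cons_index
         * moves to 5.
         */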
        while (prod_index > cq->cons_index) {
                cqe = get_cqe(cq, (prod_index - 1) & cq->ibcq.cqe);
                if (cqe->my_qpn == cpu_to_be32(qpn))
                        ++nfreed;
                else if (nfreed)
                        memcpy(get_cqe(cq, (prod_index - 1 + nfreed) &
                                       cq->ibcq.cqe),
                               cqe,
                               MTHCA_CQ_ENTRY_SIZE);
                --prod_index;
        }

        if (nfreed) {
                wmb();
                inc_cons_index(dev, cq, nfreed);
                cq->cons_index = (cq->cons_index + nfreed) & cq->ibcq.cqe;
        }

        spin_unlock_irq(&cq->lock);
        if (atomic_dec_and_test(&cq->refcount))
                wake_up(&cq->wait);
}

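/*
 * Convert a completion-with-error CQE into an ib_wc status and decide
 * what to do with the CQE itself: if further WQEs in the chain still
 * need to be flushed, the CQE is rewritten (doorbell count, wqe pointer,
 * syndrome) and left in place for the next poll; otherwise it is freed.
 */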
static int handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
                            struct mthca_qp *qp, int wqe_index, int is_send,
                            struct mthca_err_cqe *cqe,
                            struct ib_wc *entry, int *free_cqe)
{
        int err;
        int dbd;
        u32 new_wqe;

        if (1 && cqe->syndrome != SYNDROME_WR_FLUSH_ERR) {
                int j;

                mthca_dbg(dev, "%x/%d: error CQE -> QPN %06x, WQE @ %08x\n",
                          cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
                          be32_to_cpu(cqe->wqe));

                for (j = 0; j < 8; ++j)
                        printk(KERN_DEBUG "  [%2x] %08x\n",
                               j * 4, be32_to_cpu(((u32 *) cqe)[j]));
        }

        /*
         * For completions in error, only work request ID, status (and
         * freed resource count for RD) have to be set.
         */
        switch (cqe->syndrome) {
        case SYNDROME_LOCAL_LENGTH_ERR:
                entry->status = IB_WC_LOC_LEN_ERR;
                break;
        case SYNDROME_LOCAL_QP_OP_ERR:
                entry->status = IB_WC_LOC_QP_OP_ERR;
                break;
        case SYNDROME_LOCAL_EEC_OP_ERR:
                entry->status = IB_WC_LOC_EEC_OP_ERR;
                break;
        case SYNDROME_LOCAL_PROT_ERR:
                entry->status = IB_WC_LOC_PROT_ERR;
                break;
        case SYNDROME_WR_FLUSH_ERR:
                entry->status = IB_WC_WR_FLUSH_ERR;
                break;
        case SYNDROME_MW_BIND_ERR:
                entry->status = IB_WC_MW_BIND_ERR;
                break;
        case SYNDROME_BAD_RESP_ERR:
                entry->status = IB_WC_BAD_RESP_ERR;
                break;
        case SYNDROME_LOCAL_ACCESS_ERR:
                entry->status = IB_WC_LOC_ACCESS_ERR;
                break;
        case SYNDROME_REMOTE_INVAL_REQ_ERR:
                entry->status = IB_WC_REM_INV_REQ_ERR;
                break;
        case SYNDROME_REMOTE_ACCESS_ERR:
                entry->status = IB_WC_REM_ACCESS_ERR;
                break;
        case SYNDROME_REMOTE_OP_ERR:
                entry->status = IB_WC_REM_OP_ERR;
                break;
        case SYNDROME_RETRY_EXC_ERR:
                entry->status = IB_WC_RETRY_EXC_ERR;
                break;
        case SYNDROME_RNR_RETRY_EXC_ERR:
                entry->status = IB_WC_RNR_RETRY_EXC_ERR;
                break;
        case SYNDROME_LOCAL_RDD_VIOL_ERR:
                entry->status = IB_WC_LOC_RDD_VIOL_ERR;
                break;
        case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
                entry->status = IB_WC_REM_INV_RD_REQ_ERR;
                break;
        case SYNDROME_REMOTE_ABORTED_ERR:
                entry->status = IB_WC_REM_ABORT_ERR;
                break;
        case SYNDROME_INVAL_EECN_ERR:
                entry->status = IB_WC_INV_EECN_ERR;
                break;
        case SYNDROME_INVAL_EEC_STATE_ERR:
                entry->status = IB_WC_INV_EEC_STATE_ERR;
                break;
        default:
                entry->status = IB_WC_GENERAL_ERR;
                break;
        }

        err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);
        if (err)
                return err;

        /*
         * If we're at the end of the WQE chain, or we've used up our
         * doorbell count, free the CQE.  Otherwise just update it for
         * the next poll operation.
         */
        if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
                return 0;

        cqe->db_cnt   = cpu_to_be16(be16_to_cpu(cqe->db_cnt) - dbd);
        cqe->wqe      = new_wqe;
        cqe->syndrome = SYNDROME_WR_FLUSH_ERR;

        *free_cqe = 0;

        return 0;
}

static void dump_cqe(struct mthca_cqe *cqe)
{
        int j;

        for (j = 0; j < 8; ++j)
                printk(KERN_DEBUG "  [%2x] %08x\n",
                       j * 4, be32_to_cpu(((u32 *) cqe)[j]));
}

static inline int mthca_poll_one(struct mthca_dev *dev,
                                 struct mthca_cq *cq,
                                 struct mthca_qp **cur_qp,
                                 int *freed,
                                 struct ib_wc *entry)
{
        struct mthca_wq *wq;
        struct mthca_cqe *cqe;
        int wqe_index;
        int is_error = 0;
        int is_send;
        int free_cqe = 1;
        int err = 0;

        if (!next_cqe_sw(cq))
                return -EAGAIN;

        /*
         * Make sure we read CQ entry contents after we've checked the
         * ownership bit.
         */
        rmb();

        cqe = get_cqe(cq, cq->cons_index);

        if (0) {
                mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
                          cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
                          be32_to_cpu(cqe->wqe));

                dump_cqe(cqe);
        }

        if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
            MTHCA_ERROR_CQE_OPCODE_MASK) {
                is_error = 1;
                is_send = cqe->opcode & 1;
        } else
                is_send = cqe->is_send & 0x80;

        if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
                if (*cur_qp) {
                        if (*freed) {
                                wmb();
                                inc_cons_index(dev, cq, *freed);
                                *freed = 0;
                        }
                        spin_unlock(&(*cur_qp)->lock);
                }

                spin_lock(&dev->qp_table.lock);
                *cur_qp = mthca_array_get(&dev->qp_table.qp,
                                          be32_to_cpu(cqe->my_qpn) &
                                          (dev->limits.num_qps - 1));
                if (*cur_qp)
                        atomic_inc(&(*cur_qp)->refcount);
                spin_unlock(&dev->qp_table.lock);

                if (!*cur_qp) {
                        mthca_warn(dev, "CQ entry for unknown QP %06x\n",
                                   be32_to_cpu(cqe->my_qpn) & 0xffffff);
                        err = -EINVAL;
                        goto out;
                }

                spin_lock(&(*cur_qp)->lock);
        }

        entry->qp_num = (*cur_qp)->qpn;

        if (is_send) {
                wq = &(*cur_qp)->sq;
                wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
                             >> wq->wqe_shift);
                entry->wr_id = (*cur_qp)->wrid[wqe_index +
                                               (*cur_qp)->rq.max];
        } else {
                wq = &(*cur_qp)->rq;
                wqe_index = be32_to_cpu(cqe->wqe) >> wq->wqe_shift;
                entry->wr_id = (*cur_qp)->wrid[wqe_index];
        }

        if (wq->last_comp < wqe_index)
                wq->cur -= wqe_index - wq->last_comp;
        else
                wq->cur -= wq->max - wq->last_comp + wqe_index;

        wq->last_comp = wqe_index;

        if (0)
                mthca_dbg(dev, "%s completion for QP %06x, index %d (nr %d)\n",
                          is_send ? "Send" : "Receive",
                          (*cur_qp)->qpn, wqe_index, wq->max);

        if (is_error) {
                err = handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
                                       (struct mthca_err_cqe *) cqe,
                                       entry, &free_cqe);
                goto out;
        }

        if (is_send) {
                entry->opcode = IB_WC_SEND; /* XXX */
        } else {
                entry->byte_len = be32_to_cpu(cqe->byte_cnt);
                switch (cqe->opcode & 0x1f) {
                case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
                case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
                        entry->wc_flags = IB_WC_WITH_IMM;
                        entry->imm_data = cqe->imm_etype_pkey_eec;
                        entry->opcode = IB_WC_RECV;
                        break;
                case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
                case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
                        entry->wc_flags = IB_WC_WITH_IMM;
                        entry->imm_data = cqe->imm_etype_pkey_eec;
                        entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
                        break;
                default:
                        entry->wc_flags = 0;
                        entry->opcode = IB_WC_RECV;
                        break;
                }
                entry->slid        = be16_to_cpu(cqe->rlid);
                entry->sl          = be16_to_cpu(cqe->sl_g_mlpath) >> 12;
                entry->src_qp      = be32_to_cpu(cqe->rqpn) & 0xffffff;
                entry->dlid_path_bits = be16_to_cpu(cqe->sl_g_mlpath) & 0x7f;
                entry->pkey_index  = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
                entry->wc_flags   |= be16_to_cpu(cqe->sl_g_mlpath) & 0x80 ?
                                        IB_WC_GRH : 0;
        }

        entry->status = IB_WC_SUCCESS;

 out:
        if (free_cqe) {
                set_cqe_hw(cq, cq->cons_index);
                ++(*freed);
                cq->cons_index = (cq->cons_index + 1) & cq->ibcq.cqe;
        }

        return err;
}

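/*
 * Poll up to num_entries completions into entry[].  This backs the
 * ib_poll_cq() verb; a consumer typically drains the CQ in a loop,
 * for example (illustrative sketch only -- handle_completion() is a
 * hypothetical helper, not part of this driver):
 *
 *     struct ib_wc wc[8];
 *     int i, n;
 *
 *     while ((n = ib_poll_cq(cq, 8, wc)) > 0)
 *             for (i = 0; i < n; ++i)
 *                     handle_completion(&wc[i]);
 *
 * A return value of 0 just means the CQ is currently empty; the internal
 * -EAGAIN from mthca_poll_one() is not passed back to the caller.
 */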
int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
                  struct ib_wc *entry)
{
        struct mthca_dev *dev = to_mdev(ibcq->device);
        struct mthca_cq *cq = to_mcq(ibcq);
        struct mthca_qp *qp = NULL;
        unsigned long flags;
        int err = 0;
        int freed = 0;
        int npolled;

        spin_lock_irqsave(&cq->lock, flags);

        for (npolled = 0; npolled < num_entries; ++npolled) {
                err = mthca_poll_one(dev, cq, &qp,
                                     &freed, entry + npolled);
                if (err)
                        break;
        }

        if (freed) {
                wmb();
                inc_cons_index(dev, cq, freed);
        }

        if (qp) {
                spin_unlock(&qp->lock);
                if (atomic_dec_and_test(&qp->refcount))
                        wake_up(&qp->wait);
        }

        spin_unlock_irqrestore(&cq->lock, flags);

        return err == 0 || err == -EAGAIN ? npolled : err;
}

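/*
 * Request a completion event on this CQ: arm it for the next completion
 * (or, if solicited is set, the next solicited completion) by ringing
 * the REQ_NOT/REQ_NOT_SOL doorbell.  This is the device side of the
 * req_notify_cq verb.
 */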
void mthca_arm_cq(struct mthca_dev *dev, struct mthca_cq *cq,
                  int solicited)
{
        u32 doorbell[2];

        doorbell[0] = cpu_to_be32((solicited ?
                                   MTHCA_CQ_DB_REQ_NOT_SOL :
                                   MTHCA_CQ_DB_REQ_NOT)      |
                                  cq->cqn);
        doorbell[1] = 0xffffffff;

        mthca_write64(doorbell,
                      dev->kar + MTHCA_CQ_DOORBELL,
                      MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
}

int mthca_init_cq(struct mthca_dev *dev, int nent,
                  struct mthca_cq *cq)
{
        int size = nent * MTHCA_CQ_ENTRY_SIZE;
        dma_addr_t t;
        void *mailbox = NULL;
        int npages, shift;
        u64 *dma_list = NULL;
        struct mthca_cq_context *cq_context;
        int err = -ENOMEM;
        u8 status;
        int i;

        might_sleep();

        mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
                          GFP_KERNEL);
        if (!mailbox)
                goto err_out;

        cq_context = MAILBOX_ALIGN(mailbox);

        if (size <= MTHCA_MAX_DIRECT_CQ_SIZE) {
                if (0)
                        mthca_dbg(dev, "Creating direct CQ of size %d\n", size);

                cq->is_direct = 1;
                npages        = 1;
                shift         = get_order(size) + PAGE_SHIFT;

                cq->queue.direct.buf = pci_alloc_consistent(dev->pdev,
                                                            size, &t);
                if (!cq->queue.direct.buf)
                        goto err_out;

                pci_unmap_addr_set(&cq->queue.direct, mapping, t);

                memset(cq->queue.direct.buf, 0, size);

                while (t & ((1 << shift) - 1)) {
                        --shift;
                        npages *= 2;
                }

                dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
                if (!dma_list)
                        goto err_out_free;

                for (i = 0; i < npages; ++i)
                        dma_list[i] = t + i * (1 << shift);
        } else {
                cq->is_direct = 0;
                npages        = (size + PAGE_SIZE - 1) / PAGE_SIZE;
                shift         = PAGE_SHIFT;

                if (0)
                        mthca_dbg(dev, "Creating indirect CQ with %d pages\n", npages);

                dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
                if (!dma_list)
                        goto err_out;

                cq->queue.page_list = kmalloc(npages * sizeof *cq->queue.page_list,
                                              GFP_KERNEL);
                if (!cq->queue.page_list)
                        goto err_out;

                for (i = 0; i < npages; ++i)
                        cq->queue.page_list[i].buf = NULL;

                for (i = 0; i < npages; ++i) {
                        cq->queue.page_list[i].buf =
                                pci_alloc_consistent(dev->pdev, PAGE_SIZE, &t);
                        if (!cq->queue.page_list[i].buf)
                                goto err_out_free;

                        dma_list[i] = t;
                        pci_unmap_addr_set(&cq->queue.page_list[i], mapping, t);

                        memset(cq->queue.page_list[i].buf, 0, PAGE_SIZE);
                }
        }

        for (i = 0; i < nent; ++i)
                set_cqe_hw(cq, i);

        cq->cqn = mthca_alloc(&dev->cq_table.alloc);
        if (cq->cqn == -1)
                goto err_out_free;

        err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
                                  dma_list, shift, npages,
                                  0, size,
                                  MTHCA_MPT_FLAG_LOCAL_WRITE |
                                  MTHCA_MPT_FLAG_LOCAL_READ,
                                  &cq->mr);
        if (err)
                goto err_out_free_cq;

        spin_lock_init(&cq->lock);
        atomic_set(&cq->refcount, 1);
        init_waitqueue_head(&cq->wait);

        memset(cq_context, 0, sizeof *cq_context);
        cq_context->flags           = cpu_to_be32(MTHCA_CQ_STATUS_OK      |
                                                  MTHCA_CQ_STATE_DISARMED |
                                                  MTHCA_CQ_FLAG_TR);
        cq_context->start           = cpu_to_be64(0);
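        /*
         * nent is expected to be a power of two (the "& cq->ibcq.cqe"
         * masking above relies on it), so ffs(nent) - 1 is log2 of the
         * CQ size; it goes in the top byte, with MTHCA_KAR_PAGE (the
         * doorbell page used by the kernel driver) in the low bits.
         */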
        cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24 |
                                                  MTHCA_KAR_PAGE);
        cq_context->error_eqn       = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
        cq_context->comp_eqn        = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
        cq_context->pd              = cpu_to_be32(dev->driver_pd.pd_num);
        cq_context->lkey            = cpu_to_be32(cq->mr.ibmr.lkey);
        cq_context->cqn             = cpu_to_be32(cq->cqn);

        err = mthca_SW2HW_CQ(dev, cq_context, cq->cqn, &status);
        if (err) {
                mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
                goto err_out_free_mr;
        }

        if (status) {
                mthca_warn(dev, "SW2HW_CQ returned status 0x%02x\n",
                           status);
                err = -EINVAL;
                goto err_out_free_mr;
        }

        spin_lock_irq(&dev->cq_table.lock);
        if (mthca_array_set(&dev->cq_table.cq,
                            cq->cqn & (dev->limits.num_cqs - 1),
                            cq)) {
                spin_unlock_irq(&dev->cq_table.lock);
                goto err_out_free_mr;
        }
        spin_unlock_irq(&dev->cq_table.lock);

        cq->cons_index = 0;

        kfree(dma_list);
        kfree(mailbox);

        return 0;

 err_out_free_mr:
        mthca_free_mr(dev, &cq->mr);

 err_out_free_cq:
        mthca_free(&dev->cq_table.alloc, cq->cqn);

 err_out_free:
        if (cq->is_direct)
                pci_free_consistent(dev->pdev, size,
                                    cq->queue.direct.buf,
                                    pci_unmap_addr(&cq->queue.direct, mapping));
        else {
                for (i = 0; i < npages; ++i)
                        if (cq->queue.page_list[i].buf)
                                pci_free_consistent(dev->pdev, PAGE_SIZE,
                                                    cq->queue.page_list[i].buf,
                                                    pci_unmap_addr(&cq->queue.page_list[i],
                                                                   mapping));

                kfree(cq->queue.page_list);
        }

 err_out:
        kfree(dma_list);
        kfree(mailbox);

        return err;
}

void mthca_free_cq(struct mthca_dev *dev,
                   struct mthca_cq *cq)
{
        void *mailbox;
        int err;
        u8 status;

        might_sleep();

        mailbox = kmalloc(sizeof (struct mthca_cq_context) + MTHCA_CMD_MAILBOX_EXTRA,
                          GFP_KERNEL);
        if (!mailbox) {
                mthca_warn(dev, "No memory for mailbox to free CQ.\n");
                return;
        }

        err = mthca_HW2SW_CQ(dev, MAILBOX_ALIGN(mailbox), cq->cqn, &status);
        if (err)
                mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
        else if (status)
                mthca_warn(dev, "HW2SW_CQ returned status 0x%02x\n",
                           status);

        if (0) {
                u32 *ctx = MAILBOX_ALIGN(mailbox);
                int j;

                printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
                       cq->cqn, cq->cons_index, next_cqe_sw(cq));
                for (j = 0; j < 16; ++j)
                        printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
        }

        spin_lock_irq(&dev->cq_table.lock);
        mthca_array_clear(&dev->cq_table.cq,
                          cq->cqn & (dev->limits.num_cqs - 1));
        spin_unlock_irq(&dev->cq_table.lock);

        atomic_dec(&cq->refcount);
        wait_event(cq->wait, !atomic_read(&cq->refcount));

        mthca_free_mr(dev, &cq->mr);

        if (cq->is_direct)
                pci_free_consistent(dev->pdev,
                                    (cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE,
                                    cq->queue.direct.buf,
                                    pci_unmap_addr(&cq->queue.direct,
                                                   mapping));
        else {
                int i;

                for (i = 0;
                     i < ((cq->ibcq.cqe + 1) * MTHCA_CQ_ENTRY_SIZE + PAGE_SIZE - 1) /
                             PAGE_SIZE;
                     ++i)
                        pci_free_consistent(dev->pdev, PAGE_SIZE,
                                            cq->queue.page_list[i].buf,
                                            pci_unmap_addr(&cq->queue.page_list[i],
                                                           mapping));

                kfree(cq->queue.page_list);
        }

        mthca_free(&dev->cq_table.alloc, cq->cqn);
        kfree(mailbox);
}

int __devinit mthca_init_cq_table(struct mthca_dev *dev)
{
        int err;

        spin_lock_init(&dev->cq_table.lock);

        err = mthca_alloc_init(&dev->cq_table.alloc,
                               dev->limits.num_cqs,
                               (1 << 24) - 1,
                               dev->limits.reserved_cqs);
        if (err)
                return err;

        err = mthca_array_init(&dev->cq_table.cq,
                               dev->limits.num_cqs);
        if (err)
                mthca_alloc_cleanup(&dev->cq_table.alloc);

        return err;
}

void __devexit mthca_cleanup_cq_table(struct mthca_dev *dev)
{
        mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
        mthca_alloc_cleanup(&dev->cq_table.alloc);
}