Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
author Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 14 Feb 2007 05:16:39 +0000 (21:16 -0800)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Wed, 14 Feb 2007 05:16:39 +0000 (21:16 -0800)
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband:
  IB/mthca: Always fill MTTs from CPU
  IB/mthca: Merge MR and FMR space on 64-bit systems
  IB/mthca: Fix access to MTT and MPT tables on non-cache-coherent CPUs
  IB/mthca: Give reserved MTTs a separate cache line
  IB/mthca: Fix reserved MTTs calculation on mem-free HCAs
  RDMA/cxgb3: Add driver for Chelsio T3 RNIC
  IB: Remove redundant "_wq" from workqueue names
  RDMA/cma: Increment port number after close to avoid re-use
  IB/ehca: Fix memleak on module unloading
  IB/mthca: Work around gcc bug on sparc64
  IPoIB: Connected mode experimental support
  IB/core: Use ARRAY_SIZE macro for mandatory_table
  IB/mthca: Use correct structure size in call to memset()

46 files changed:
drivers/infiniband/Kconfig
drivers/infiniband/Makefile
drivers/infiniband/core/addr.c
drivers/infiniband/core/cma.c
drivers/infiniband/core/device.c
drivers/infiniband/hw/cxgb3/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/Makefile [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_dbg.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_hal.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_hal.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_resource.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_resource.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/cxio_wr.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_cm.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_cm.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_cq.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_ev.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_mem.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_provider.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_provider.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_qp.c [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/iwch_user.h [new file with mode: 0644]
drivers/infiniband/hw/cxgb3/tcb.h [new file with mode: 0644]
drivers/infiniband/hw/ehca/ehca_irq.c
drivers/infiniband/hw/mthca/mthca_cmd.c
drivers/infiniband/hw/mthca/mthca_dev.h
drivers/infiniband/hw/mthca/mthca_main.c
drivers/infiniband/hw/mthca/mthca_memfree.c
drivers/infiniband/hw/mthca/mthca_memfree.h
drivers/infiniband/hw/mthca/mthca_mr.c
drivers/infiniband/hw/mthca/mthca_profile.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_provider.h
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/hw/mthca/mthca_srq.c
drivers/infiniband/ulp/ipoib/Kconfig
drivers/infiniband/ulp/ipoib/Makefile
drivers/infiniband/ulp/ipoib/ipoib.h
drivers/infiniband/ulp/ipoib/ipoib_cm.c [new file with mode: 0644]
drivers/infiniband/ulp/ipoib/ipoib_ib.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_multicast.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/ipoib/ipoib_vlan.c

index 9edfacee7d8442cb6e5c49db23d493829991d540..66b36de9fa6f531a3a9b329ac3177ad0cceead68 100644 (file)
@@ -38,6 +38,7 @@ source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/ehca/Kconfig"
 source "drivers/infiniband/hw/amso1100/Kconfig"
+source "drivers/infiniband/hw/cxgb3/Kconfig"
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
index 2b5d1098ef45f4c3ba9fa5ccfed4c2aa76b0e7c3..da2066c4f22c03da192a390a5d5893226f6ff7f6 100644 (file)
@@ -3,6 +3,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA)          += hw/mthca/
 obj-$(CONFIG_INFINIBAND_IPATH)         += hw/ipath/
 obj-$(CONFIG_INFINIBAND_EHCA)          += hw/ehca/
 obj-$(CONFIG_INFINIBAND_AMSO1100)      += hw/amso1100/
+obj-$(CONFIG_INFINIBAND_CXGB3)         += hw/cxgb3/
 obj-$(CONFIG_INFINIBAND_IPOIB)         += ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)           += ulp/srp/
 obj-$(CONFIG_INFINIBAND_ISER)          += ulp/iser/
index d2bb5a9a303fcb7d86f3a38deea2e33e7d8e9c2c..a91001c59b69d66a57770457c2c464ea0c5d14ff 100644 (file)
@@ -373,7 +373,7 @@ static struct notifier_block nb = {
 
 static int addr_init(void)
 {
-       addr_wq = create_singlethread_workqueue("ib_addr_wq");
+       addr_wq = create_singlethread_workqueue("ib_addr");
        if (!addr_wq)
                return -ENOMEM;
 
index 9e0ab048c878eba5b07d57501762f83c3f1f2356..db88e609bf429f6a1fb9f37527f0c21651d14320 100644 (file)
@@ -71,6 +71,7 @@ static struct workqueue_struct *cma_wq;
 static DEFINE_IDR(sdp_ps);
 static DEFINE_IDR(tcp_ps);
 static DEFINE_IDR(udp_ps);
+static int next_port;
 
 struct cma_device {
        struct list_head        list;
@@ -1722,33 +1723,74 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
                          unsigned short snum)
 {
        struct rdma_bind_list *bind_list;
-       int port, start, ret;
+       int port, ret;
 
        bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
        if (!bind_list)
                return -ENOMEM;
 
-       start = snum ? snum : sysctl_local_port_range[0];
+       do {
+               ret = idr_get_new_above(ps, bind_list, snum, &port);
+       } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
+
+       if (ret)
+               goto err1;
+
+       if (port != snum) {
+               ret = -EADDRNOTAVAIL;
+               goto err2;
+       }
+
+       bind_list->ps = ps;
+       bind_list->port = (unsigned short) port;
+       cma_bind_port(bind_list, id_priv);
+       return 0;
+err2:
+       idr_remove(ps, port);
+err1:
+       kfree(bind_list);
+       return ret;
+}
 
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+       struct rdma_bind_list *bind_list;
+       int port, ret;
+
+       bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+       if (!bind_list)
+               return -ENOMEM;
+
+retry:
        do {
-               ret = idr_get_new_above(ps, bind_list, start, &port);
+               ret = idr_get_new_above(ps, bind_list, next_port, &port);
        } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
 
        if (ret)
-               goto err;
+               goto err1;
 
-       if ((snum && port != snum) ||
-           (!snum && port > sysctl_local_port_range[1])) {
-               idr_remove(ps, port);
+       if (port > sysctl_local_port_range[1]) {
+               if (next_port != sysctl_local_port_range[0]) {
+                       idr_remove(ps, port);
+                       next_port = sysctl_local_port_range[0];
+                       goto retry;
+               }
                ret = -EADDRNOTAVAIL;
-               goto err;
+               goto err2;
        }
 
+       if (port == sysctl_local_port_range[1])
+               next_port = sysctl_local_port_range[0];
+       else
+               next_port = port + 1;
+
        bind_list->ps = ps;
        bind_list->port = (unsigned short) port;
        cma_bind_port(bind_list, id_priv);
        return 0;
-err:
+err2:
+       idr_remove(ps, port);
+err1:
        kfree(bind_list);
        return ret;
 }
@@ -1811,7 +1853,7 @@ static int cma_get_port(struct rdma_id_private *id_priv)
 
        mutex_lock(&lock);
        if (cma_any_port(&id_priv->id.route.addr.src_addr))
-               ret = cma_alloc_port(ps, id_priv, 0);
+               ret = cma_alloc_any_port(ps, id_priv);
        else
                ret = cma_use_port(ps, id_priv);
        mutex_unlock(&lock);
@@ -2448,7 +2490,11 @@ static int cma_init(void)
 {
        int ret;
 
-       cma_wq = create_singlethread_workqueue("rdma_cm_wq");
+       get_random_bytes(&next_port, sizeof next_port);
+       next_port = (next_port % (sysctl_local_port_range[1] -
+                                 sysctl_local_port_range[0])) +
+                   sysctl_local_port_range[0];
+       cma_wq = create_singlethread_workqueue("rdma_cm");
        if (!cma_wq)
                return -ENOMEM;
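
The cma.c hunks above replace the single cma_alloc_port() path with a rotating cursor (next_port) so that a port freed on close is not handed out again immediately. Below is a minimal userspace sketch of that wrap-around pattern only; pick_next_port, port_range_lo/hi and the fixed range are illustrative stand-ins, and the real code additionally round-trips through the IDR so still-bound ports are skipped and randomizes next_port in cma_init().

/*
 * Sketch only: wrap-around port selection in the spirit of
 * cma_alloc_any_port() above.  Names and range bounds are illustrative,
 * not kernel identifiers.
 */
#include <stdio.h>

static int port_range_lo = 32768;	/* stand-in for sysctl_local_port_range[0] */
static int port_range_hi = 61000;	/* stand-in for sysctl_local_port_range[1] */
static int next_port     = 60999;	/* the kernel randomizes this at init time */

static int pick_next_port(void)
{
	int port = next_port;

	/* Advance the cursor, wrapping to the bottom of the range, so a
	 * port that was just released is not reused right away. */
	if (port == port_range_hi)
		next_port = port_range_lo;
	else
		next_port = port + 1;
	return port;
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++)
		printf("allocated port %d\n", pick_next_port());
	return 0;
}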
 
index 63d2a39fb82c84a83aab4454f6336d81efbc41ae..7fabb425b033a2de2243d50b24164937c0e99e49 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/mutex.h>
@@ -93,7 +94,7 @@ static int ib_device_check_mandatory(struct ib_device *device)
        };
        int i;
 
-       for (i = 0; i < sizeof mandatory_table / sizeof mandatory_table[0]; ++i) {
+       for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
                if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
                        printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
                               device->name, mandatory_table[i].name);
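
The device.c hunk above swaps the open-coded sizeof division for ARRAY_SIZE() from <linux/kernel.h> (hence the new include). A small standalone illustration of the equivalence, with a made-up table:

/* Illustration only: ARRAY_SIZE() versus the open-coded sizeof division.
 * The table contents here are invented; only the iteration idiom matters. */
#include <stdio.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static const char *mandatory_table[] = { "query_device", "query_port", "alloc_pd" };

int main(void)
{
	unsigned int i;

	/* Old: sizeof mandatory_table / sizeof mandatory_table[0]
	 * New: ARRAY_SIZE(mandatory_table) -- same value, clearer intent. */
	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i)
		printf("%s\n", mandatory_table[i]);
	return 0;
}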
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
new file mode 100644 (file)
index 0000000..77977f5
--- /dev/null
@@ -0,0 +1,27 @@
+config INFINIBAND_CXGB3
+       tristate "Chelsio RDMA Driver"
+       depends on CHELSIO_T3 && INFINIBAND && INET
+       select GENERIC_ALLOCATOR
+       ---help---
+         This is an iWARP/RDMA driver for the Chelsio T3 1GbE and
+         10GbE adapters.
+
+         For general information about Chelsio and our products, visit
+         our website at <http://www.chelsio.com>.
+
+         For customer support, please visit our customer support page at
+         <http://www.chelsio.com/support.htm>.
+
+         Please send feedback to <linux-bugs@chelsio.com>.
+
+         To compile this driver as a module, choose M here: the module
+         will be called iw_cxgb3.
+
+config INFINIBAND_CXGB3_DEBUG
+       bool "Verbose debugging output"
+       depends on INFINIBAND_CXGB3
+       default n
+       ---help---
+         This option causes the Chelsio RDMA driver to produce copious
+         amounts of debug messages.  Select this if you are developing
+         the driver or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
new file mode 100644 (file)
index 0000000..0e110f3
--- /dev/null
@@ -0,0 +1,12 @@
+EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3 \
+               -I$(TOPDIR)/drivers/infiniband/hw/cxgb3/core
+
+obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
+
+iw_cxgb3-y :=  iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
+              iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
+
+ifdef CONFIG_INFINIBAND_CXGB3_DEBUG
+EXTRA_CFLAGS += -DDEBUG
+iw_cxgb3-y += cxio_dbg.o
+endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
new file mode 100644 (file)
index 0000000..5a7306f
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DEBUG
+#include <linux/types.h>
+#include "common.h"
+#include "cxgb3_ioctl.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
+void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag)
+{
+       struct ch_mem_range *m;
+       u64 *data;
+       int rc;
+       int size = 32;
+
+       m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+       if (!m) {
+               PDBG("%s couldn't allocate memory.\n", __FUNCTION__);
+               return;
+       }
+       m->mem_id = MEM_PMRX;
+       m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base;
+       m->len = size;
+       PDBG("%s TPT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len);
+       rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+       if (rc) {
+               PDBG("%s toectl returned error %d\n", __FUNCTION__, rc);
+               kfree(m);
+               return;
+       }
+
+       data = (u64 *)m->buf;
+       while (size > 0) {
+               PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+               size -= 8;
+               data++;
+               m->addr += 8;
+       }
+       kfree(m);
+}
+
+void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift)
+{
+       struct ch_mem_range *m;
+       u64 *data;
+       int rc;
+       int size, npages;
+
+       shift += 12;
+       npages = (len + (1ULL << shift) - 1) >> shift;
+       size = npages * sizeof(u64);
+
+       m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+       if (!m) {
+               PDBG("%s couldn't allocate memory.\n", __FUNCTION__);
+               return;
+       }
+       m->mem_id = MEM_PMRX;
+       m->addr = pbl_addr;
+       m->len = size;
+       PDBG("%s PBL addr 0x%x len %d depth %d\n",
+               __FUNCTION__, m->addr, m->len, npages);
+       rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+       if (rc) {
+               PDBG("%s toectl returned error %d\n", __FUNCTION__, rc);
+               kfree(m);
+               return;
+       }
+
+       data = (u64 *)m->buf;
+       while (size > 0) {
+               PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data);
+               size -= 8;
+               data++;
+               m->addr += 8;
+       }
+       kfree(m);
+}
+
+void cxio_dump_wqe(union t3_wr *wqe)
+{
+       __be64 *data = (__be64 *)wqe;
+       uint size = (uint)(be64_to_cpu(*data) & 0xff);
+
+       if (size == 0)
+               size = 8;
+       while (size > 0) {
+               PDBG("WQE %p: %016llx\n", data,
+                    (unsigned long long) be64_to_cpu(*data));
+               size--;
+               data++;
+       }
+}
+
+void cxio_dump_wce(struct t3_cqe *wce)
+{
+       __be64 *data = (__be64 *)wce;
+       int size = sizeof(*wce);
+
+       while (size > 0) {
+               PDBG("WCE %p: %016llx\n", data,
+                    (unsigned long long) be64_to_cpu(*data));
+               size -= 8;
+               data++;
+       }
+}
+
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents)
+{
+       struct ch_mem_range *m;
+       int size = nents * 64;
+       u64 *data;
+       int rc;
+
+       m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+       if (!m) {
+               PDBG("%s couldn't allocate memory.\n", __FUNCTION__);
+               return;
+       }
+       m->mem_id = MEM_PMRX;
+       m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base;
+       m->len = size;
+       PDBG("%s RQT addr 0x%x len %d\n", __FUNCTION__, m->addr, m->len);
+       rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+       if (rc) {
+               PDBG("%s toectl returned error %d\n", __FUNCTION__, rc);
+               kfree(m);
+               return;
+       }
+
+       data = (u64 *)m->buf;
+       while (size > 0) {
+               PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+               size -= 8;
+               data++;
+               m->addr += 8;
+       }
+       kfree(m);
+}
+
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid)
+{
+       struct ch_mem_range *m;
+       int size = TCB_SIZE;
+       u32 *data;
+       int rc;
+
+       m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+       if (!m) {
+               PDBG("%s couldn't allocate memory.\n", __FUNCTION__);
+               return;
+       }
+       m->mem_id = MEM_CM;
+       m->addr = hwtid * size;
+       m->len = size;
+       PDBG("%s TCB %d len %d\n", __FUNCTION__, m->addr, m->len);
+       rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+       if (rc) {
+               PDBG("%s toectl returned error %d\n", __FUNCTION__, rc);
+               kfree(m);
+               return;
+       }
+
+       data = (u32 *)m->buf;
+       while (size > 0) {
+               printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n",
+                       m->addr,
+                       *(data+2), *(data+3), *(data),*(data+1),
+                       *(data+6), *(data+7), *(data+4), *(data+5));
+               size -= 32;
+               data += 8;
+               m->addr += 32;
+       }
+       kfree(m);
+}
+#endif
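
Each dump helper in cxio_dbg.c above uses the same idiom: a single kmalloc of sizeof(*m) + size so the header and the payload that the RDMA_GET_MEM ctl fills in live in one allocation, after which the payload is walked in 8-byte words. A userspace sketch of that pattern follows, with a hypothetical struct standing in for ch_mem_range:

/* Sketch only: single allocation for header plus payload, then an 8-byte
 * walk of the payload, as in the cxio_dump_* helpers above.  struct mem_dump
 * is a stand-in, not the driver's ch_mem_range. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct mem_dump {
	unsigned int addr;	/* adapter address being dumped */
	unsigned int len;	/* payload length in bytes */
	unsigned char buf[];	/* flexible array member: payload follows the header */
};

int main(void)
{
	int size = 32;
	struct mem_dump *m;
	unsigned long long *data;
	int left;

	/* One allocation covers header plus payload, mirroring
	 * kmalloc(sizeof(*m) + size, GFP_ATOMIC) in the driver. */
	m = malloc(sizeof(*m) + size);
	if (!m)
		return 1;
	m->addr = 0x1000;
	m->len = size;
	memset(m->buf, 0xab, size);	/* stands in for the ctl call filling buf */

	data = (unsigned long long *)m->buf;
	for (left = size; left > 0; left -= 8, data++, m->addr += 8)
		printf("%08x: %016llx\n", m->addr, *data);

	free(m);
	return 0;
}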
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
new file mode 100644 (file)
index 0000000..82fa720
--- /dev/null
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <asm/delay.h>
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+#include "sge_defs.h"
+
+static LIST_HEAD(rdev_list);
+static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
+
+static inline struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name)
+{
+       struct cxio_rdev *rdev;
+
+       list_for_each_entry(rdev, &rdev_list, entry)
+               if (!strcmp(rdev->dev_name, dev_name))
+                       return rdev;
+       return NULL;
+}
+
+static inline struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev
+                                                            *tdev)
+{
+       struct cxio_rdev *rdev;
+
+       list_for_each_entry(rdev, &rdev_list, entry)
+               if (rdev->t3cdev_p == tdev)
+                       return rdev;
+       return NULL;
+}
+
+int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
+                  enum t3_cq_opcode op, u32 credit)
+{
+       int ret;
+       struct t3_cqe *cqe;
+       u32 rptr;
+
+       struct rdma_cq_op setup;
+       setup.id = cq->cqid;
+       setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
+       setup.op = op;
+       ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
+
+       if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
+               return ret;
+
+       /*
+        * If the rearm returned an index other than our current index,
+        * then there might be CQEs in flight (being DMA'd).  We must wait
+        * here for them to complete or the consumer can miss a notification.
+        */
+       if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) {
+               int i=0;
+
+               rptr = cq->rptr;
+
+               /*
+                * Keep the generation correct by bumping rptr until it
+                * matches the index returned by the rearm - 1.
+                */
+               while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret)
+                       rptr++;
+
+               /*
+                * Now rptr is the index for the (last) cqe that was
+                * in-flight at the time the HW rearmed the CQ.  We
+                * spin until that CQE is valid.
+                */
+               cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2);
+               while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
+                       udelay(1);
+                       if (i++ > 1000000) {
+                               BUG_ON(1);
+                               printk(KERN_ERR "%s: stalled rnic\n",
+                                      rdev_p->dev_name);
+                               return -EIO;
+                       }
+               }
+       }
+       return 0;
+}
+
+static inline int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
+{
+       struct rdma_cq_setup setup;
+       setup.id = cqid;
+       setup.base_addr = 0;    /* NULL address */
+       setup.size = 0;         /* disable the CQ */
+       setup.credits = 0;
+       setup.credit_thres = 0;
+       setup.ovfl_mode = 0;
+       return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
+{
+       u64 sge_cmd;
+       struct t3_modify_qp_wr *wqe;
+       struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+       if (!skb) {
+               PDBG("%s alloc_skb failed\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+       memset(wqe, 0, sizeof(*wqe));
+       build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 3, 1, qpid, 7);
+       wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+       sge_cmd = qpid << 8 | 3;
+       wqe->sge_cmd = cpu_to_be64(sge_cmd);
+       skb->priority = CPL_PRIORITY_CONTROL;
+       return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb));
+}
+
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+       struct rdma_cq_setup setup;
+       int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
+
+       cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
+       if (!cq->cqid)
+               return -ENOMEM;
+       cq->sw_queue = kzalloc(size, GFP_KERNEL);
+       if (!cq->sw_queue)
+               return -ENOMEM;
+       cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
+                                            (1UL << (cq->size_log2)) *
+                                            sizeof(struct t3_cqe),
+                                            &(cq->dma_addr), GFP_KERNEL);
+       if (!cq->queue) {
+               kfree(cq->sw_queue);
+               return -ENOMEM;
+       }
+       pci_unmap_addr_set(cq, mapping, cq->dma_addr);
+       memset(cq->queue, 0, size);
+       setup.id = cq->cqid;
+       setup.base_addr = (u64) (cq->dma_addr);
+       setup.size = 1UL << cq->size_log2;
+       setup.credits = 65535;
+       setup.credit_thres = 1;
+       if (rdev_p->t3cdev_p->type == T3B)
+               setup.ovfl_mode = 0;
+       else
+               setup.ovfl_mode = 1;
+       return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+       struct rdma_cq_setup setup;
+       setup.id = cq->cqid;
+       setup.base_addr = (u64) (cq->dma_addr);
+       setup.size = 1UL << cq->size_log2;
+       setup.credits = setup.size;
+       setup.credit_thres = setup.size;        /* TBD: overflow recovery */
+       setup.ovfl_mode = 1;
+       return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+       struct cxio_qpid_list *entry;
+       u32 qpid;
+       int i;
+
+       mutex_lock(&uctx->lock);
+       if (!list_empty(&uctx->qpids)) {
+               entry = list_entry(uctx->qpids.next, struct cxio_qpid_list,
+                                  entry);
+               list_del(&entry->entry);
+               qpid = entry->qpid;
+               kfree(entry);
+       } else {
+               qpid = cxio_hal_get_qpid(rdev_p->rscp);
+               if (!qpid)
+                       goto out;
+               for (i = qpid+1; i & rdev_p->qpmask; i++) {
+                       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+                       if (!entry)
+                               break;
+                       entry->qpid = i;
+                       list_add_tail(&entry->entry, &uctx->qpids);
+               }
+       }
+out:
+       mutex_unlock(&uctx->lock);
+       PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid);
+       return qpid;
+}
+
+static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid,
+                    struct cxio_ucontext *uctx)
+{
+       struct cxio_qpid_list *entry;
+
+       entry = kmalloc(sizeof *entry, GFP_KERNEL);
+       if (!entry)
+               return;
+       PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid);
+       entry->qpid = qpid;
+       mutex_lock(&uctx->lock);
+       list_add_tail(&entry->entry, &uctx->qpids);
+       mutex_unlock(&uctx->lock);
+}
+
+void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+       struct list_head *pos, *nxt;
+       struct cxio_qpid_list *entry;
+
+       mutex_lock(&uctx->lock);
+       list_for_each_safe(pos, nxt, &uctx->qpids) {
+               entry = list_entry(pos, struct cxio_qpid_list, entry);
+               list_del_init(&entry->entry);
+               if (!(entry->qpid & rdev_p->qpmask))
+                       cxio_hal_put_qpid(rdev_p->rscp, entry->qpid);
+               kfree(entry);
+       }
+       mutex_unlock(&uctx->lock);
+}
+
+void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+       INIT_LIST_HEAD(&uctx->qpids);
+       mutex_init(&uctx->lock);
+}
+
+int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
+                  struct t3_wq *wq, struct cxio_ucontext *uctx)
+{
+       int depth = 1UL << wq->size_log2;
+       int rqsize = 1UL << wq->rq_size_log2;
+
+       wq->qpid = get_qpid(rdev_p, uctx);
+       if (!wq->qpid)
+               return -ENOMEM;
+
+       wq->rq = kzalloc(depth * sizeof(u64), GFP_KERNEL);
+       if (!wq->rq)
+               goto err1;
+
+       wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize);
+       if (!wq->rq_addr)
+               goto err2;
+
+       wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL);
+       if (!wq->sq)
+               goto err3;
+
+       wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
+                                            depth * sizeof(union t3_wr),
+                                            &(wq->dma_addr), GFP_KERNEL);
+       if (!wq->queue)
+               goto err4;
+
+       memset(wq->queue, 0, depth * sizeof(union t3_wr));
+       pci_unmap_addr_set(wq, mapping, wq->dma_addr);
+       wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+       if (!kernel_domain)
+               wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
+                                       (wq->qpid << rdev_p->qpshift);
+       PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __FUNCTION__,
+            wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
+       return 0;
+err4:
+       kfree(wq->sq);
+err3:
+       cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize);
+err2:
+       kfree(wq->rq);
+err1:
+       put_qpid(rdev_p, wq->qpid, uctx);
+       return -ENOMEM;
+}
+
+int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+       int err;
+       err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+       kfree(cq->sw_queue);
+       dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+                         (1UL << (cq->size_log2))
+                         * sizeof(struct t3_cqe), cq->queue,
+                         pci_unmap_addr(cq, mapping));
+       cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
+       return err;
+}
+
+int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
+                   struct cxio_ucontext *uctx)
+{
+       dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+                         (1UL << (wq->size_log2))
+                         * sizeof(union t3_wr), wq->queue,
+                         pci_unmap_addr(wq, mapping));
+       kfree(wq->sq);
+       cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2));
+       kfree(wq->rq);
+       put_qpid(rdev_p, wq->qpid, uctx);
+       return 0;
+}
+
+static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq)
+{
+       struct t3_cqe cqe;
+
+       PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__,
+            wq, cq, cq->sw_rptr, cq->sw_wptr);
+       memset(&cqe, 0, sizeof(cqe));
+       cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+                                V_CQE_OPCODE(T3_SEND) |
+                                V_CQE_TYPE(0) |
+                                V_CQE_SWCQE(1) |
+                                V_CQE_QPID(wq->qpid) |
+                                V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+                                                      cq->size_log2)));
+       *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+       cq->sw_wptr++;
+}
+
+void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+       u32 ptr;
+
+       PDBG("%s wq %p cq %p\n", __FUNCTION__, wq, cq);
+
+       /* flush RQ */
+       PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __FUNCTION__,
+           wq->rq_rptr, wq->rq_wptr, count);
+       ptr = wq->rq_rptr + count;
+       while (ptr++ != wq->rq_wptr)
+               insert_recv_cqe(wq, cq);
+}
+
+static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
+                         struct t3_swsq *sqp)
+{
+       struct t3_cqe cqe;
+
+       PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __FUNCTION__,
+            wq, cq, cq->sw_rptr, cq->sw_wptr);
+       memset(&cqe, 0, sizeof(cqe));
+       cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+                                V_CQE_OPCODE(sqp->opcode) |
+                                V_CQE_TYPE(1) |
+                                V_CQE_SWCQE(1) |
+                                V_CQE_QPID(wq->qpid) |
+                                V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+                                                      cq->size_log2)));
+       cqe.u.scqe.wrid_hi = sqp->sq_wptr;
+
+       *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+       cq->sw_wptr++;
+}
+
+void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+       __u32 ptr;
+       struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2);
+
+       ptr = wq->sq_rptr + count;
+       sqp += count;
+       while (ptr != wq->sq_wptr) {
+               insert_sq_cqe(wq, cq, sqp);
+               sqp++;
+               ptr++;
+       }
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ */
+void cxio_flush_hw_cq(struct t3_cq *cq)
+{
+       struct t3_cqe *cqe, *swcqe;
+
+       PDBG("%s cq %p cqid 0x%x\n", __FUNCTION__, cq, cq->cqid);
+       cqe = cxio_next_hw_cqe(cq);
+       while (cqe) {
+               PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n",
+                    __FUNCTION__, cq->rptr, cq->sw_wptr);
+               swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2);
+               *swcqe = *cqe;
+               swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
+               cq->sw_wptr++;
+               cq->rptr++;
+               cqe = cxio_next_hw_cqe(cq);
+       }
+}
+
+static inline int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq)
+{
+       if (CQE_OPCODE(*cqe) == T3_TERMINATE)
+               return 0;
+
+       if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe))
+               return 0;
+
+       if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
+               return 0;
+
+       if ((CQE_OPCODE(*cqe) == T3_SEND) && RQ_TYPE(*cqe) &&
+           Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
+               return 0;
+
+       return 1;
+}
+
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+       struct t3_cqe *cqe;
+       u32 ptr;
+
+       *count = 0;
+       ptr = cq->sw_rptr;
+       while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+               cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+               if ((SQ_TYPE(*cqe) || (CQE_OPCODE(*cqe) == T3_READ_RESP)) &&
+                   (CQE_QPID(*cqe) == wq->qpid))
+                       (*count)++;
+               ptr++;
+       }
+       PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count);
+}
+
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+       struct t3_cqe *cqe;
+       u32 ptr;
+
+       *count = 0;
+       PDBG("%s count zero %d\n", __FUNCTION__, *count);
+       ptr = cq->sw_rptr;
+       while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+               cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+               if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) &&
+                   (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq))
+                       (*count)++;
+               ptr++;
+       }
+       PDBG("%s cq %p count %d\n", __FUNCTION__, cq, *count);
+}
+
+static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p)
+{
+       struct rdma_cq_setup setup;
+       setup.id = 0;
+       setup.base_addr = 0;    /* NULL address */
+       setup.size = 1;         /* enable the CQ */
+       setup.credits = 0;
+
+       /* force SGE to redirect to RspQ and interrupt */
+       setup.credit_thres = 0;
+       setup.ovfl_mode = 1;
+       return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+       int err;
+       u64 sge_cmd, ctx0, ctx1;
+       u64 base_addr;
+       struct t3_modify_qp_wr *wqe;
+       struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+
+
+       if (!skb) {
+               PDBG("%s alloc_skb failed\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       err = cxio_hal_init_ctrl_cq(rdev_p);
+       if (err) {
+               PDBG("%s err %d initializing ctrl_cq\n", __FUNCTION__, err);
+               return err;
+       }
+       rdev_p->ctrl_qp.workq = dma_alloc_coherent(
+                                       &(rdev_p->rnic_info.pdev->dev),
+                                       (1 << T3_CTRL_QP_SIZE_LOG2) *
+                                       sizeof(union t3_wr),
+                                       &(rdev_p->ctrl_qp.dma_addr),
+                                       GFP_KERNEL);
+       if (!rdev_p->ctrl_qp.workq) {
+               PDBG("%s dma_alloc_coherent failed\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       pci_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
+                          rdev_p->ctrl_qp.dma_addr);
+       rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+       memset(rdev_p->ctrl_qp.workq, 0,
+              (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
+
+       mutex_init(&rdev_p->ctrl_qp.lock);
+       init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
+
+       /* update HW Ctrl QP context */
+       base_addr = rdev_p->ctrl_qp.dma_addr;
+       base_addr >>= 12;
+       ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) |
+               V_EC_BASE_LO((u32) base_addr & 0xffff));
+       ctx0 <<= 32;
+       ctx0 |= V_EC_CREDITS(FW_WR_NUM);
+       base_addr >>= 16;
+       ctx1 = (u32) base_addr;
+       base_addr >>= 32;
+       ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) |
+                       V_EC_TYPE(0) | V_EC_GEN(1) |
+                       V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32;
+       wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+       memset(wqe, 0, sizeof(*wqe));
+       build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 1,
+                      T3_CTL_QP_TID, 7);
+       wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+       sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
+       wqe->sge_cmd = cpu_to_be64(sge_cmd);
+       wqe->ctx1 = cpu_to_be64(ctx1);
+       wqe->ctx0 = cpu_to_be64(ctx0);
+       PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n",
+            (unsigned long long) rdev_p->ctrl_qp.dma_addr,
+            rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
+       skb->priority = CPL_PRIORITY_CONTROL;
+       return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb));
+}
+
+static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+       dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+                         (1UL << T3_CTRL_QP_SIZE_LOG2)
+                         * sizeof(union t3_wr), rdev_p->ctrl_qp.workq,
+                         pci_unmap_addr(&rdev_p->ctrl_qp, mapping));
+       return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID);
+}
+
+/* write len bytes of data into addr (32B aligned address)
+ * If data is NULL, clear len bytes of memory to zero.
+ * The caller acquires the ctrl_qp lock before the call
+ */
+static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
+                                     u32 len, void *data, int completion)
+{
+       u32 i, nr_wqe, copy_len;
+       u8 *copy_data;
+       u8 wr_len, utx_len;     /* length in 8-byte flits */
+       enum t3_wr_flags flag;
+       __be64 *wqe;
+       u64 utx_cmd;
+       addr &= 0x7FFFFFF;
+       nr_wqe = len % 96 ? len / 96 + 1 : len / 96;    /* 96B max per WQE */
+       PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n",
+            __FUNCTION__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len,
+            nr_wqe, data, addr);
+       utx_len = 3;            /* in 32B unit */
+       for (i = 0; i < nr_wqe; i++) {
+               if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr,
+                          T3_CTRL_QP_SIZE_LOG2)) {
+                       PDBG("%s ctrl_qp full wptr 0x%0x rptr 0x%0x, "
+                            "wait for more space i %d\n", __FUNCTION__,
+                            rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i);
+                       if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+                                            !Q_FULL(rdev_p->ctrl_qp.rptr,
+                                                    rdev_p->ctrl_qp.wptr,
+                                                    T3_CTRL_QP_SIZE_LOG2))) {
+                               PDBG("%s ctrl_qp workq interrupted\n",
+                                    __FUNCTION__);
+                               return -ERESTARTSYS;
+                       }
+                       PDBG("%s ctrl_qp wakeup, continue posting work request "
+                            "i %d\n", __FUNCTION__, i);
+               }
+               wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+                                               (1 << T3_CTRL_QP_SIZE_LOG2)));
+               flag = 0;
+               if (i == (nr_wqe - 1)) {
+                       /* last WQE */
+                       flag = completion ? T3_COMPLETION_FLAG : 0;
+                       if (len % 32)
+                               utx_len = len / 32 + 1;
+                       else
+                               utx_len = len / 32;
+               }
+
+               /*
+                * Force a CQE to return the credit to the workq in case
+                * we posted more than half the max QP size of WRs
+                */
+               if ((i != 0) &&
+                   (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) {
+                       flag = T3_COMPLETION_FLAG;
+                       PDBG("%s force completion at i %d\n", __FUNCTION__, i);
+               }
+
+               /* build the utx mem command */
+               wqe += (sizeof(struct t3_bypass_wr) >> 3);
+               utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3);
+               utx_cmd <<= 32;
+               utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1);
+               *wqe = cpu_to_be64(utx_cmd);
+               wqe++;
+               copy_data = (u8 *) data + i * 96;
+               copy_len = len > 96 ? 96 : len;
+
+               /* clear memory content if data is NULL */
+               if (data)
+                       memcpy(wqe, copy_data, copy_len);
+               else
+                       memset(wqe, 0, copy_len);
+               if (copy_len % 32)
+                       memset(((u8 *) wqe) + copy_len, 0,
+                              32 - (copy_len % 32));
+               wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 +
+                        (utx_len << 2);
+               wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+                             (1 << T3_CTRL_QP_SIZE_LOG2)));
+
+               /* wptr in the WRID[31:0] */
+               ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr;
+
+               /*
+                * This must be the last write with a memory barrier
+                * for the genbit
+                */
+               build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
+                              Q_GENBIT(rdev_p->ctrl_qp.wptr,
+                                       T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
+                              wr_len);
+               if (flag == T3_COMPLETION_FLAG)
+                       ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
+               len -= 96;
+               rdev_p->ctrl_qp.wptr++;
+       }
+       return 0;
+}
+
+/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl, and pbl_size
+ * OUT: stag index, actual pbl_size, pbl_addr allocated.
+ * TBD: shared memory region support
+ */
+static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
+                        u32 *stag, u8 stag_state, u32 pdid,
+                        enum tpt_mem_type type, enum tpt_mem_perm perm,
+                        u32 zbva, u64 to, u32 len, u8 page_size, __be64 *pbl,
+                        u32 *pbl_size, u32 *pbl_addr)
+{
+       int err;
+       struct tpt_entry tpt;
+       u32 stag_idx;
+       u32 wptr;
+       int rereg = (*stag != T3_STAG_UNSET);
+
+       stag_state = stag_state > 0;
+       stag_idx = (*stag) >> 8;
+
+       if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) {
+               stag_idx = cxio_hal_get_stag(rdev_p->rscp);
+               if (!stag_idx)
+                       return -ENOMEM;
+               *stag = (stag_idx << 8) | ((*stag) & 0xFF);
+       }
+       PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+            __FUNCTION__, stag_state, type, pdid, stag_idx);
+
+       if (reset_tpt_entry)
+               cxio_hal_pblpool_free(rdev_p, *pbl_addr, *pbl_size << 3);
+       else if (!rereg) {
+               *pbl_addr = cxio_hal_pblpool_alloc(rdev_p, *pbl_size << 3);
+               if (!*pbl_addr) {
+                       return -ENOMEM;
+               }
+       }
+
+       mutex_lock(&rdev_p->ctrl_qp.lock);
+
+       /* write PBL first if any - update pbl only if a pbl list exists */
+       if (pbl) {
+
+               PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+                    __FUNCTION__, *pbl_addr, rdev_p->rnic_info.pbl_base,
+                    *pbl_size);
+               err = cxio_hal_ctrl_qp_write_mem(rdev_p,
+                               (*pbl_addr >> 5),
+                               (*pbl_size << 3), pbl, 0);
+               if (err)
+                       goto ret;
+       }
+
+       /* write TPT entry */
+       if (reset_tpt_entry)
+               memset(&tpt, 0, sizeof(tpt));
+       else {
+               tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID |
+                               V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) |
+                               V_TPT_STAG_STATE(stag_state) |
+                               V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid));
+               BUG_ON(page_size >= 28);
+               tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) |
+                               F_TPT_MW_BIND_ENABLE |
+                               V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
+                               V_TPT_PAGE_SIZE(page_size));
+               tpt.rsvd_pbl_addr = reset_tpt_entry ? 0 :
+                                   cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, *pbl_addr)>>3));
+               tpt.len = cpu_to_be32(len);
+               tpt.va_hi = cpu_to_be32((u32) (to >> 32));
+               tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));
+               tpt.rsvd_bind_cnt_or_pstag = 0;
+               tpt.rsvd_pbl_size = reset_tpt_entry ? 0 :
+                                 cpu_to_be32(V_TPT_PBL_SIZE((*pbl_size) >> 2));
+       }
+       err = cxio_hal_ctrl_qp_write_mem(rdev_p,
+                                      stag_idx +
+                                      (rdev_p->rnic_info.tpt_base >> 5),
+                                      sizeof(tpt), &tpt, 1);
+
+       /* release the stag index to free pool */
+       if (reset_tpt_entry)
+               cxio_hal_put_stag(rdev_p->rscp, stag_idx);
+ret:
+       wptr = rdev_p->ctrl_qp.wptr;
+       mutex_unlock(&rdev_p->ctrl_qp.lock);
+       if (!err)
+               if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+                                            SEQ32_GE(rdev_p->ctrl_qp.rptr,
+                                                     wptr)))
+                       return -ERESTARTSYS;
+       return err;
+}
+
+/* IN: stag key, pdid, pbl_size
+ * OUT: stag index, actual pbl_size, and pbl_addr allocated.
+ */
+int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid,
+                      enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr)
+{
+       *stag = T3_STAG_UNSET;
+       return (__cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
+                             perm, 0, 0ULL, 0, 0, NULL, pbl_size, pbl_addr));
+}
+
+int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+                          enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+                          u8 page_size, __be64 *pbl, u32 *pbl_size,
+                          u32 *pbl_addr)
+{
+       *stag = T3_STAG_UNSET;
+       return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+                            zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
+}
+
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+                          enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+                          u8 page_size, __be64 *pbl, u32 *pbl_size,
+                          u32 *pbl_addr)
+{
+       return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+                            zbva, to, len, page_size, pbl, pbl_size, pbl_addr);
+}
+
+int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
+                  u32 pbl_addr)
+{
+       return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
+                            &pbl_size, &pbl_addr);
+}
+
+int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
+{
+       u32 pbl_size = 0;
+       *stag = T3_STAG_UNSET;
+       return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
+                            NULL, &pbl_size, NULL);
+}
+
+int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
+{
+       return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0, NULL,
+                            NULL, NULL);
+}
+
+int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
+{
+       struct t3_rdma_init_wr *wqe;
+       struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC);
+       if (!skb)
+               return -ENOMEM;
+       PDBG("%s rdev_p %p\n", __FUNCTION__, rdev_p);
+       wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe));
+       wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT));
+       wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) |
+                                          V_FW_RIWR_LEN(sizeof(*wqe) >> 3));
+       wqe->wrid.id1 = 0;
+       wqe->qpid = cpu_to_be32(attr->qpid);
+       wqe->pdid = cpu_to_be32(attr->pdid);
+       wqe->scqid = cpu_to_be32(attr->scqid);
+       wqe->rcqid = cpu_to_be32(attr->rcqid);
+       wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base);
+       wqe->rq_size = cpu_to_be32(attr->rq_size);
+       wqe->mpaattrs = attr->mpaattrs;
+       wqe->qpcaps = attr->qpcaps;
+       wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss);
+       wqe->flags = cpu_to_be32(attr->flags);
+       wqe->ord = cpu_to_be32(attr->ord);
+       wqe->ird = cpu_to_be32(attr->ird);
+       wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr);
+       wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size);
+       wqe->rsvd = 0;
+       skb->priority = 0;      /* 0=>ToeQ; 1=>CtrlQ */
+       return (cxgb3_ofld_send(rdev_p->t3cdev_p, skb));
+}
+
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+       cxio_ev_cb = ev_cb;
+}
+
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+       cxio_ev_cb = NULL;
+}
+
+static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb)
+{
+       static int cnt;
+       struct cxio_rdev *rdev_p = NULL;
+       struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+       PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x"
+            " se %0x notify %0x cqbranch %0x creditth %0x\n",
+            cnt, __FUNCTION__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg),
+            RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg),
+            RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg),
+            RSPQ_CREDIT_THRESH(rsp_msg));
+       PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d "
+            "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+            CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe),
+            CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+            CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe),
+            CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+       rdev_p = (struct cxio_rdev *)t3cdev_p->ulp;
+       if (!rdev_p) {
+               PDBG("%s called by t3cdev %p with null ulp\n", __FUNCTION__,
+                    t3cdev_p);
+               return 0;
+       }
+       if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) {
+               rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1;
+               wake_up_interruptible(&rdev_p->ctrl_qp.waitq);
+               dev_kfree_skb_irq(skb);
+       } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8)
+               dev_kfree_skb_irq(skb);
+       else if (cxio_ev_cb)
+               (*cxio_ev_cb) (rdev_p, skb);
+       else
+               dev_kfree_skb_irq(skb);
+       cnt++;
+       return 0;
+}
+
+/* Caller takes care of locking if needed */
+int cxio_rdev_open(struct cxio_rdev *rdev_p)
+{
+       struct net_device *netdev_p = NULL;
+       int err = 0;
+       if (strlen(rdev_p->dev_name)) {
+               if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) {
+                       return -EBUSY;
+               }
+               netdev_p = dev_get_by_name(rdev_p->dev_name);
+               if (!netdev_p) {
+                       return -EINVAL;
+               }
+               dev_put(netdev_p);
+       } else if (rdev_p->t3cdev_p) {
+               if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) {
+                       return -EBUSY;
+               }
+               netdev_p = rdev_p->t3cdev_p->lldev;
+               strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name,
+                       T3_MAX_DEV_NAME_LEN);
+       } else {
+               PDBG("%s t3cdev_p or dev_name must be set\n", __FUNCTION__);
+               return -EINVAL;
+       }
+
+       list_add_tail(&rdev_p->entry, &rdev_list);
+
+       PDBG("%s opening rnic dev %s\n", __FUNCTION__, rdev_p->dev_name);
+       memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
+       if (!rdev_p->t3cdev_p)
+               rdev_p->t3cdev_p = T3CDEV(netdev_p);
+       rdev_p->t3cdev_p->ulp = (void *) rdev_p;
+       err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
+                                        &(rdev_p->rnic_info));
+       if (err) {
+               printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+                    __FUNCTION__, rdev_p->t3cdev_p, err);
+               goto err1;
+       }
+       err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS,
+                                   &(rdev_p->port_info));
+       if (err) {
+               printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+                    __FUNCTION__, rdev_p->t3cdev_p, err);
+               goto err1;
+       }
+
+       /*
+        * qpshift is the number of bits to shift the qpid left in order
+        * to get the correct address of the doorbell for that qp.
+        */
+       cxio_init_ucontext(rdev_p, &rdev_p->uctx);
+       rdev_p->qpshift = PAGE_SHIFT -
+                         ilog2(65536 >>
+                                   ilog2(rdev_p->rnic_info.udbell_len >>
+                                             PAGE_SHIFT));
+       rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT;
+       rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1;
+       PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d "
+            "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n",
+            __FUNCTION__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base,
+            rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p),
+            rdev_p->rnic_info.pbl_base,
+            rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base,
+            rdev_p->rnic_info.rqt_top);
+       PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu "
+            "qpnr %d qpmask 0x%x\n",
+            rdev_p->rnic_info.udbell_len,
+            rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr,
+            rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask);
+
+       err = cxio_hal_init_ctrl_qp(rdev_p);
+       if (err) {
+               printk(KERN_ERR "%s error %d initializing ctrl_qp.\n",
+                      __FUNCTION__, err);
+               goto err1;
+       }
+       err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0,
+                                    0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ,
+                                    T3_MAX_NUM_PD);
+       if (err) {
+               printk(KERN_ERR "%s error %d initializing hal resources.\n",
+                      __FUNCTION__, err);
+               goto err2;
+       }
+       err = cxio_hal_pblpool_create(rdev_p);
+       if (err) {
+               printk(KERN_ERR "%s error %d initializing pbl mem pool.\n",
+                      __FUNCTION__, err);
+               goto err3;
+       }
+       err = cxio_hal_rqtpool_create(rdev_p);
+       if (err) {
+               printk(KERN_ERR "%s error %d initializing rqt mem pool.\n",
+                      __FUNCTION__, err);
+               goto err4;
+       }
+       return 0;
+err4:
+       cxio_hal_pblpool_destroy(rdev_p);
+err3:
+       cxio_hal_destroy_resource(rdev_p->rscp);
+err2:
+       cxio_hal_destroy_ctrl_qp(rdev_p);
+err1:
+       list_del(&rdev_p->entry);
+       return err;
+}
+
+void cxio_rdev_close(struct cxio_rdev *rdev_p)
+{
+       if (rdev_p) {
+               cxio_hal_pblpool_destroy(rdev_p);
+               cxio_hal_rqtpool_destroy(rdev_p);
+               list_del(&rdev_p->entry);
+               rdev_p->t3cdev_p->ulp = NULL;
+               cxio_hal_destroy_ctrl_qp(rdev_p);
+               cxio_hal_destroy_resource(rdev_p->rscp);
+       }
+}
+
+int __init cxio_hal_init(void)
+{
+       if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI))
+               return -ENOMEM;
+       t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
+       return 0;
+}
+
+void __exit cxio_hal_exit(void)
+{
+       struct cxio_rdev *rdev, *tmp;
+
+       t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL);
+       list_for_each_entry_safe(rdev, tmp, &rdev_list, entry)
+               cxio_rdev_close(rdev);
+       cxio_hal_destroy_rhdl_resource();
+}
+
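+/*
+ * Scan the pending SW SQ entries: unsignaled WRs are skipped; if the
+ * first signaled WR has already completed out of order, its stashed CQE
+ * is moved into the SW CQ.  Either way we stop at the first signaled WR.
+ */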
+static inline void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq)
+{
+       struct t3_swsq *sqp;
+       __u32 ptr = wq->sq_rptr;
+       int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr);
+
+       sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+       while (count--)
+               if (!sqp->signaled) {
+                       ptr++;
+                       sqp = wq->sq + Q_PTR2IDX(ptr,  wq->sq_size_log2);
+               } else if (sqp->complete) {
+
+                       /*
+                        * Insert this completed cqe into the swcq.
+                        */
+                       PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n",
+                            __FUNCTION__, Q_PTR2IDX(ptr,  wq->sq_size_log2),
+                            Q_PTR2IDX(cq->sw_wptr, cq->size_log2));
+                       sqp->cqe.header |= htonl(V_CQE_SWCQE(1));
+                       *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2))
+                               = sqp->cqe;
+                       cq->sw_wptr++;
+                       sqp->signaled = 0;
+                       break;
+               } else
+                       break;
+}
+
+static inline void create_read_req_cqe(struct t3_wq *wq,
+                                      struct t3_cqe *hw_cqe,
+                                      struct t3_cqe *read_cqe)
+{
+       read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr;
+       read_cqe->len = wq->oldest_read->read_len;
+       read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) |
+                                V_CQE_SWCQE(SW_CQE(*hw_cqe)) |
+                                V_CQE_OPCODE(T3_READ_REQ) |
+                                V_CQE_TYPE(1));
+}
+
+/*
+ * Return a ptr to the next read wr in the SWSQ or NULL.
+ */
+static inline void advance_oldest_read(struct t3_wq *wq)
+{
+
+       u32 rptr = wq->oldest_read - wq->sq + 1;
+       u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2);
+
+       while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) {
+               wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2);
+
+               if (wq->oldest_read->opcode == T3_READ_REQ)
+                       return;
+               rptr++;
+       }
+       wq->oldest_read = NULL;
+}
+
+/*
+ * cxio_poll_cq
+ *
+ * Caller must:
+ *     check the validity of the first CQE,
+ *     supply the wq associated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ *     0       CQE returned,
+ *    -1       CQE skipped, try again.
+ */
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+                    u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+       int ret = 0;
+       struct t3_cqe *hw_cqe, read_cqe;
+
+       *cqe_flushed = 0;
+       *credit = 0;
+       hw_cqe = cxio_next_cqe(cq);
+
+       PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x"
+            " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+            __FUNCTION__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe),
+            CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe),
+            CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe),
+            CQE_WRID_LOW(*hw_cqe));
+
+       /*
+        * skip CQEs not affiliated with a QP.
+        */
+       if (wq == NULL) {
+               ret = -1;
+               goto skip_cqe;
+       }
+
+       /*
+        * Gotta tweak READ completions:
+        *      1) the cqe doesn't contain the sq_wptr from the wr.
+        *      2) opcode not reflected from the wr.
+        *      3) read_len not reflected from the wr.
+        *      4) cq_type is RQ_TYPE not SQ_TYPE.
+        */
+       if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) {
+
+               /*
+                * Don't write to the HWCQ, so create a new read req CQE
+                * in local memory.
+                */
+               create_read_req_cqe(wq, hw_cqe, &read_cqe);
+               hw_cqe = &read_cqe;
+               advance_oldest_read(wq);
+       }
+
+       /*
+        * T3A: Discard TERMINATE CQEs.
+        */
+       if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) {
+               ret = -1;
+               wq->error = 1;
+               goto skip_cqe;
+       }
+
+       if (CQE_STATUS(*hw_cqe) || wq->error) {
+               *cqe_flushed = wq->error;
+               wq->error = 1;
+
+               /*
+                * T3A inserts errors into the CQE.  We cannot return
+                * these as work completions.
+                */
+               /* incoming write failures */
+               if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE)
+                    && RQ_TYPE(*hw_cqe)) {
+                       ret = -1;
+                       goto skip_cqe;
+               }
+               /* incoming read request failures */
+               if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) {
+                       ret = -1;
+                       goto skip_cqe;
+               }
+
+               /* incoming SEND with no receive posted failures */
+               if ((CQE_OPCODE(*hw_cqe) == T3_SEND) && RQ_TYPE(*hw_cqe) &&
+                   Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+                       ret = -1;
+                       goto skip_cqe;
+               }
+               goto proc_cqe;
+       }
+
+       /*
+        * RECV completion.
+        */
+       if (RQ_TYPE(*hw_cqe)) {
+
+               /*
+                * HW only validates 4 bits of MSN.  So we must validate that
+        * the MSN in the SEND is the next expected MSN.  If it's not,
+                * then we complete this with TPT_ERR_MSN and mark the wq in
+                * error.
+                */
+               if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
+                       wq->error = 1;
+                       hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
+                       goto proc_cqe;
+               }
+               goto proc_cqe;
+       }
+
+       /*
+        * If we get here it's a send completion.
+        *
+        * Handle out of order completion. These get stuffed
+        * in the SW SQ. Then the SW SQ is walked to move any
+        * now in-order completions into the SW CQ.  This handles
+        * 2 cases:
+        *      1) reaping unsignaled WRs when the first subsequent
+        *         signaled WR is completed.
+        *      2) out of order read completions.
+        */
+       if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) {
+               struct t3_swsq *sqp;
+
+               PDBG("%s out of order completion going in swsq at idx %ld\n",
+                    __FUNCTION__,
+                    Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2));
+               sqp = wq->sq +
+                     Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2);
+               sqp->cqe = *hw_cqe;
+               sqp->complete = 1;
+               ret = -1;
+               goto flush_wq;
+       }
+
+proc_cqe:
+       *cqe = *hw_cqe;
+
+       /*
+        * Reap the associated WR(s) that are freed up with this
+        * completion.
+        */
+       if (SQ_TYPE(*hw_cqe)) {
+               wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
+               PDBG("%s completing sq idx %ld\n", __FUNCTION__,
+                    Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
+               *cookie = (wq->sq +
+                          Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2))->wr_id;
+               wq->sq_rptr++;
+       } else {
+               PDBG("%s completing rq idx %ld\n", __FUNCTION__,
+                    Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+               *cookie = *(wq->rq + Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+               wq->rq_rptr++;
+       }
+
+flush_wq:
+       /*
+        * Flush any completed cqes that are now in-order.
+        */
+       flush_completed_wrs(wq, cq);
+
+skip_cqe:
+       if (SW_CQE(*hw_cqe)) {
+               PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n",
+                    __FUNCTION__, cq, cq->cqid, cq->sw_rptr);
+               ++cq->sw_rptr;
+       } else {
+               PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n",
+                    __FUNCTION__, cq, cq->cqid, cq->rptr);
+               ++cq->rptr;
+
+               /*
+                * T3A: compute credits.
+                */
+               if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1)))
+                   || ((cq->rptr - cq->wptr) >= 128)) {
+                       *credit = cq->rptr - cq->wptr;
+                       cq->wptr = cq->rptr;
+               }
+       }
+       return ret;
+}
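+
+/*
+ * Illustrative sketch only (not part of this patch): one way a consumer
+ * might drive cxio_poll_cq(), assuming a single wq is associated with
+ * the CQ.  It peeks the next CQE, polls it, and returns any accumulated
+ * credit to the SGE with a CQ_CREDIT_UPDATE operation.
+ */
+static void example_drain_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq,
+                            struct t3_wq *wq)
+{
+       struct t3_cqe cqe;
+       u8 cqe_flushed;
+       u64 cookie;
+       u32 credit;
+
+       while (cxio_next_cqe(cq)) {
+               if (cxio_poll_cq(wq, cq, &cqe, &cqe_flushed, &cookie, &credit))
+                       continue;       /* CQE skipped, try again */
+               /* ... turn cqe/cookie into a work completion here ... */
+               if (credit)
+                       cxio_hal_cq_op(rdev_p, cq, CQ_CREDIT_UPDATE, credit);
+       }
+}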
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
new file mode 100644 (file)
index 0000000..1b97e80
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef  __CXIO_HAL_H__
+#define  __CXIO_HAL_H__
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#include "t3_cpl.h"
+#include "t3cdev.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxio_wr.h"
+
+#define T3_CTRL_QP_ID    FW_RI_SGEEC_START
+#define T3_CTL_QP_TID   FW_RI_TID_START
+#define T3_CTRL_QP_SIZE_LOG2  8
+#define T3_CTRL_CQ_ID    0
+
+/* TBD */
+#define T3_MAX_NUM_RI (1<<15)
+#define T3_MAX_NUM_QP (1<<15)
+#define T3_MAX_NUM_CQ (1<<15)
+#define T3_MAX_NUM_PD (1<<15)
+#define T3_MAX_PBL_SIZE 256
+#define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_NUM_STAG (1<<15)
+
+#define T3_STAG_UNSET 0xffffffff
+
+#define T3_MAX_DEV_NAME_LEN 32
+
+struct cxio_hal_ctrl_qp {
+       u32 wptr;
+       u32 rptr;
+       struct mutex lock;      /* for the wptr, can sleep */
+       wait_queue_head_t waitq;/* wait for RspQ/CQE msg */
+       union t3_wr *workq;     /* the work request queue */
+       dma_addr_t dma_addr;    /* pci bus address of the workq */
+       DECLARE_PCI_UNMAP_ADDR(mapping)
+       void __iomem *doorbell;
+};
+
+struct cxio_hal_resource {
+       struct kfifo *tpt_fifo;
+       spinlock_t tpt_fifo_lock;
+       struct kfifo *qpid_fifo;
+       spinlock_t qpid_fifo_lock;
+       struct kfifo *cqid_fifo;
+       spinlock_t cqid_fifo_lock;
+       struct kfifo *pdid_fifo;
+       spinlock_t pdid_fifo_lock;
+};
+
+struct cxio_qpid_list {
+       struct list_head entry;
+       u32 qpid;
+};
+
+struct cxio_ucontext {
+       struct list_head qpids;
+       struct mutex lock;
+};
+
+struct cxio_rdev {
+       char dev_name[T3_MAX_DEV_NAME_LEN];
+       struct t3cdev *t3cdev_p;
+       struct rdma_info rnic_info;
+       struct adap_ports port_info;
+       struct cxio_hal_resource *rscp;
+       struct cxio_hal_ctrl_qp ctrl_qp;
+       void *ulp;
+       unsigned long qpshift;
+       u32 qpnr;
+       u32 qpmask;
+       struct cxio_ucontext uctx;
+       struct gen_pool *pbl_pool;
+       struct gen_pool *rqt_pool;
+       struct list_head entry;
+};
+
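+/*
+ * The usable STag count is bounded by the TPT window; the >> 5 in
+ * cxio_num_stags() reflects the 32-byte struct tpt_entry.
+ */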
+static inline int cxio_num_stags(struct cxio_rdev *rdev_p)
+{
+       return min((int)T3_MAX_NUM_STAG,
+                  (int)((rdev_p->rnic_info.tpt_top -
+                         rdev_p->rnic_info.tpt_base) >> 5));
+}
+
+typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p,
+                                            struct sk_buff * skb);
+
+#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff)
+#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff)
+#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1)
+#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1)
+#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1)
+#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1)
+#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1)
+#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1)
+#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1)
+
+struct respQ_msg_t {
+       __be32 flags;           /* flit 0 */
+       __be32 cq_ptrid;
+       __be64 rsvd;            /* flit 1 */
+       struct t3_cqe cqe;      /* flits 2-3 */
+};
+
+enum t3_cq_opcode {
+       CQ_ARM_AN = 0x2,
+       CQ_ARM_SE = 0x6,
+       CQ_FORCE_AN = 0x3,
+       CQ_CREDIT_UPDATE = 0x7
+};
+
+int cxio_rdev_open(struct cxio_rdev *rdev);
+void cxio_rdev_close(struct cxio_rdev *rdev);
+int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
+                  enum t3_cq_opcode op, u32 credit);
+int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev, u32 qpid);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
+                  struct cxio_ucontext *uctx);
+int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
+                   struct cxio_ucontext *uctx);
+int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
+int cxio_allocate_stag(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+                      enum tpt_mem_perm perm, u32 * pbl_size, u32 * pbl_addr);
+int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+                          enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+                          u8 page_size, __be64 *pbl, u32 *pbl_size,
+                          u32 *pbl_addr);
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+                          enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+                          u8 page_size, __be64 *pbl, u32 *pbl_size,
+                          u32 *pbl_addr);
+int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
+                  u32 pbl_addr);
+int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
+int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
+int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+u32 cxio_hal_get_rhdl(void);
+void cxio_hal_put_rhdl(u32 rhdl);
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
+int __init cxio_hal_init(void);
+void __exit cxio_hal_exit(void);
+void cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
+void cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_flush_hw_cq(struct t3_cq *cq);
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+                    u8 *cqe_flushed, u64 *cookie, u32 *credit);
+
+#define MOD "iw_cxgb3: "
+#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args)
+
+#ifdef DEBUG
+void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag);
+void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift);
+void cxio_dump_wqe(union t3_wr *wqe);
+void cxio_dump_wce(struct t3_cqe *wce);
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents);
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid);
+#endif
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
new file mode 100644 (file)
index 0000000..997aa32
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+
+static struct kfifo *rhdl_fifo;
+static spinlock_t rhdl_fifo_lock;
+
+#define RANDOM_SIZE 16
+
+static int __cxio_init_resource_fifo(struct kfifo **fifo,
+                                  spinlock_t *fifo_lock,
+                                  u32 nr, u32 skip_low,
+                                  u32 skip_high,
+                                  int random)
+{
+       u32 i, j, entry = 0, idx;
+       u32 random_bytes;
+       u32 rarray[16];
+       spin_lock_init(fifo_lock);
+
+       *fifo = kfifo_alloc(nr * sizeof(u32), GFP_KERNEL, fifo_lock);
+       if (IS_ERR(*fifo))
+               return -ENOMEM;
+
+       for (i = 0; i < skip_low + skip_high; i++)
+               __kfifo_put(*fifo, (unsigned char *) &entry, sizeof(u32));
+       if (random) {
+               j = 0;
+               random_bytes = random32();
+               for (i = 0; i < RANDOM_SIZE; i++)
+                       rarray[i] = i + skip_low;
+               for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
+                       if (j >= RANDOM_SIZE) {
+                               j = 0;
+                               random_bytes = random32();
+                       }
+                       idx = (random_bytes >> (j * 2)) & 0xF;
+                       __kfifo_put(*fifo,
+                               (unsigned char *) &rarray[idx],
+                               sizeof(u32));
+                       rarray[idx] = i;
+                       j++;
+               }
+               for (i = 0; i < RANDOM_SIZE; i++)
+                       __kfifo_put(*fifo,
+                               (unsigned char *) &rarray[i],
+                               sizeof(u32));
+       } else
+               for (i = skip_low; i < nr - skip_high; i++)
+                       __kfifo_put(*fifo, (unsigned char *) &i, sizeof(u32));
+
+       for (i = 0; i < skip_low + skip_high; i++)
+               kfifo_get(*fifo, (unsigned char *) &entry, sizeof(u32));
+       return 0;
+}
+
+static int cxio_init_resource_fifo(struct kfifo **fifo, spinlock_t * fifo_lock,
+                                  u32 nr, u32 skip_low, u32 skip_high)
+{
+       return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+                                         skip_high, 0));
+}
+
+static int cxio_init_resource_fifo_random(struct kfifo **fifo,
+                                  spinlock_t * fifo_lock,
+                                  u32 nr, u32 skip_low, u32 skip_high)
+{
+
+       return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+                                         skip_high, 1));
+}
+
+static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p)
+{
+       u32 i;
+
+       spin_lock_init(&rdev_p->rscp->qpid_fifo_lock);
+
+       rdev_p->rscp->qpid_fifo = kfifo_alloc(T3_MAX_NUM_QP * sizeof(u32),
+                                             GFP_KERNEL,
+                                             &rdev_p->rscp->qpid_fifo_lock);
+       if (IS_ERR(rdev_p->rscp->qpid_fifo))
+               return -ENOMEM;
+
+       for (i = 16; i < T3_MAX_NUM_QP; i++)
+               if (!(i & rdev_p->qpmask))
+                       __kfifo_put(rdev_p->rscp->qpid_fifo,
+                                   (unsigned char *) &i, sizeof(u32));
+       return 0;
+}
+
+int cxio_hal_init_rhdl_resource(u32 nr_rhdl)
+{
+       return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1,
+                                      0);
+}
+
+void cxio_hal_destroy_rhdl_resource(void)
+{
+       kfifo_free(rhdl_fifo);
+}
+
+/* nr_* must be power of 2 */
+int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+                          u32 nr_tpt, u32 nr_pbl,
+                          u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid)
+{
+       int err = 0;
+       struct cxio_hal_resource *rscp;
+
+       rscp = kmalloc(sizeof(*rscp), GFP_KERNEL);
+       if (!rscp)
+               return -ENOMEM;
+       rdev_p->rscp = rscp;
+       err = cxio_init_resource_fifo_random(&rscp->tpt_fifo,
+                                     &rscp->tpt_fifo_lock,
+                                     nr_tpt, 1, 0);
+       if (err)
+               goto tpt_err;
+       err = cxio_init_qpid_fifo(rdev_p);
+       if (err)
+               goto qpid_err;
+       err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock,
+                                     nr_cqid, 1, 0);
+       if (err)
+               goto cqid_err;
+       err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock,
+                                     nr_pdid, 1, 0);
+       if (err)
+               goto pdid_err;
+       return 0;
+pdid_err:
+       kfifo_free(rscp->cqid_fifo);
+cqid_err:
+       kfifo_free(rscp->qpid_fifo);
+qpid_err:
+       kfifo_free(rscp->tpt_fifo);
+tpt_err:
+       return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
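+/*
+ * Note that 0 doubles as the "empty" return value below, which is why
+ * the fifos are populated with skip_low >= 1 (and qpids start at 16),
+ * keeping ID 0 out of circulation.
+ */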
+static inline u32 cxio_hal_get_resource(struct kfifo *fifo)
+{
+       u32 entry;
+       if (kfifo_get(fifo, (unsigned char *) &entry, sizeof(u32)))
+               return entry;
+       else
+               return 0;       /* fifo empty */
+}
+
+static inline void cxio_hal_put_resource(struct kfifo *fifo, u32 entry)
+{
+       BUG_ON(kfifo_put(fifo, (unsigned char *) &entry, sizeof(u32)) == 0);
+}
+
+u32 cxio_hal_get_rhdl(void)
+{
+       return cxio_hal_get_resource(rhdl_fifo);
+}
+
+void cxio_hal_put_rhdl(u32 rhdl)
+{
+       cxio_hal_put_resource(rhdl_fifo, rhdl);
+}
+
+u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp)
+{
+       return cxio_hal_get_resource(rscp->tpt_fifo);
+}
+
+void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag)
+{
+       cxio_hal_put_resource(rscp->tpt_fifo, stag);
+}
+
+u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp)
+{
+       u32 qpid = cxio_hal_get_resource(rscp->qpid_fifo);
+       PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid);
+       return qpid;
+}
+
+void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid)
+{
+       PDBG("%s qpid 0x%x\n", __FUNCTION__, qpid);
+       cxio_hal_put_resource(rscp->qpid_fifo, qpid);
+}
+
+u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp)
+{
+       return cxio_hal_get_resource(rscp->cqid_fifo);
+}
+
+void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid)
+{
+       cxio_hal_put_resource(rscp->cqid_fifo, cqid);
+}
+
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp)
+{
+       return cxio_hal_get_resource(rscp->pdid_fifo);
+}
+
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid)
+{
+       cxio_hal_put_resource(rscp->pdid_fifo, pdid);
+}
+
+void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp)
+{
+       kfifo_free(rscp->tpt_fifo);
+       kfifo_free(rscp->cqid_fifo);
+       kfifo_free(rscp->qpid_fifo);
+       kfifo_free(rscp->pdid_fifo);
+       kfree(rscp);
+}
+
+/*
+ * PBL Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8                        /* 256B == min PBL size (32 entries) */
+#define PBL_CHUNK (2*1024*1024)
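+/*
+ * The adapter's PBL address range [pbl_base, pbl_top) is handed to the
+ * generic allocator in 2MB chunks; allocations then come back as
+ * multiples of 256 bytes, i.e. 32 eight-byte PBL entries.
+ */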
+
+u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+       unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size);
+       PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size);
+       return (u32)addr;
+}
+
+void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+       PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size);
+       gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size);
+}
+
+int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p)
+{
+       unsigned long i;
+       rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+       if (rdev_p->pbl_pool)
+               for (i = rdev_p->rnic_info.pbl_base;
+                    i <= rdev_p->rnic_info.pbl_top - PBL_CHUNK + 1;
+                    i += PBL_CHUNK)
+                       gen_pool_add(rdev_p->pbl_pool, i, PBL_CHUNK, -1);
+       return rdev_p->pbl_pool ? 0 : -ENOMEM;
+}
+
+void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p)
+{
+       gen_pool_destroy(rdev_p->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager.  Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10       /* 1KB == min RQT size (16 entries) */
+#define RQT_CHUNK (2*1024*1024)
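+/*
+ * cxio_hal_rqtpool_alloc()/_free() take sizes in RQ entries; the << 6 in
+ * those helpers converts to bytes (64 bytes per RQE), so the 1KB minimum
+ * above covers 16 entries.
+ */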
+
+u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+       unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6);
+       PDBG("%s addr 0x%x size %d\n", __FUNCTION__, (u32)addr, size << 6);
+       return (u32)addr;
+}
+
+void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+       PDBG("%s addr 0x%x size %d\n", __FUNCTION__, addr, size << 6);
+       gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6);
+}
+
+int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p)
+{
+       unsigned long i;
+       rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+       if (rdev_p->rqt_pool)
+               for (i = rdev_p->rnic_info.rqt_base;
+                    i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1;
+                    i += RQT_CHUNK)
+                       gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1);
+       return rdev_p->rqt_pool ? 0 : -ENOMEM;
+}
+
+void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p)
+{
+       gen_pool_destroy(rdev_p->rqt_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h
new file mode 100644 (file)
index 0000000..a6bbe83
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_RESOURCE_H__
+#define __CXIO_RESOURCE_H__
+
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/genalloc.h>
+#include "cxio_hal.h"
+
+extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl);
+extern void cxio_hal_destroy_rhdl_resource(void);
+extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+                                 u32 nr_tpt, u32 nr_pbl,
+                                 u32 nr_rqt, u32 nr_qpid, u32 nr_cqid,
+                                 u32 nr_pdid);
+extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag);
+extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid);
+extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid);
+extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp);
+
+#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base )
+extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+
+#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base )
+extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
new file mode 100644 (file)
index 0000000..103fc42
--- /dev/null
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_WR_H__
+#define __CXIO_WR_H__
+
+#include <asm/io.h>
+#include <linux/pci.h>
+#include <linux/timer.h>
+#include "firmware_exports.h"
+
+#define T3_MAX_SGE      4
+
+#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
+#define Q_FULL(rptr,wptr,size_log2)  ( (((wptr)-(rptr))>>(size_log2)) && \
+                                      ((rptr)!=(wptr)) )
+#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>(size_log2))&0x1))
+#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<(size_log2))-((wptr)-(rptr)))
+#define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
+#define Q_PTR2IDX(ptr,size_log2) ((ptr) & ((1UL<<(size_log2))-1))
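+/*
+ * Illustrative example: with size_log2 == 2 the queue holds 4 entries;
+ * rptr/wptr increase without wrapping, Q_PTR2IDX() masks them down to
+ * slots 0-3, and Q_GENBIT() flips each time a pointer crosses a multiple
+ * of the queue size, which is what CQ_VLD_ENTRY() below keys off.
+ */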
+
+static inline void ring_doorbell(void __iomem *doorbell, u32 qpid)
+{
+       writel(((1<<31) | qpid), doorbell);
+}
+
+#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 ))
+
+enum t3_wr_flags {
+       T3_COMPLETION_FLAG = 0x01,
+       T3_NOTIFY_FLAG = 0x02,
+       T3_SOLICITED_EVENT_FLAG = 0x04,
+       T3_READ_FENCE_FLAG = 0x08,
+       T3_LOCAL_FENCE_FLAG = 0x10
+} __attribute__ ((packed));
+
+enum t3_wr_opcode {
+       T3_WR_BP = FW_WROPCODE_RI_BYPASS,
+       T3_WR_SEND = FW_WROPCODE_RI_SEND,
+       T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE,
+       T3_WR_READ = FW_WROPCODE_RI_RDMA_READ,
+       T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV,
+       T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
+       T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
+       T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
+       T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP
+} __attribute__ ((packed));
+
+enum t3_rdma_opcode {
+       T3_RDMA_WRITE,          /* IETF RDMAP v1.0 ... */
+       T3_READ_REQ,
+       T3_READ_RESP,
+       T3_SEND,
+       T3_SEND_WITH_INV,
+       T3_SEND_WITH_SE,
+       T3_SEND_WITH_SE_INV,
+       T3_TERMINATE,
+       T3_RDMA_INIT,           /* CHELSIO RI specific ... */
+       T3_BIND_MW,
+       T3_FAST_REGISTER,
+       T3_LOCAL_INV,
+       T3_QP_MOD,
+       T3_BYPASS
+} __attribute__ ((packed));
+
+static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
+{
+       switch (wrop) {
+               case T3_WR_BP: return T3_BYPASS;
+               case T3_WR_SEND: return T3_SEND;
+               case T3_WR_WRITE: return T3_RDMA_WRITE;
+               case T3_WR_READ: return T3_READ_REQ;
+               case T3_WR_INV_STAG: return T3_LOCAL_INV;
+               case T3_WR_BIND: return T3_BIND_MW;
+               case T3_WR_INIT: return T3_RDMA_INIT;
+               case T3_WR_QP_MOD: return T3_QP_MOD;
+               default: break;
+       }
+       return -1;
+}
+
+
+/* Work request id */
+union t3_wrid {
+       struct {
+               u32 hi;
+               u32 low;
+       } id0;
+       u64 id1;
+};
+
+#define WRID(wrid)             (wrid.id1)
+#define WRID_GEN(wrid)         (wrid.id0.wr_gen)
+#define WRID_IDX(wrid)         (wrid.id0.wr_idx)
+#define WRID_LO(wrid)          (wrid.id0.wr_lo)
+
+struct fw_riwrh {
+       __be32 op_seop_flags;
+       __be32 gen_tid_len;
+};
+
+#define S_FW_RIWR_OP           24
+#define M_FW_RIWR_OP           0xff
+#define V_FW_RIWR_OP(x)                ((x) << S_FW_RIWR_OP)
+#define G_FW_RIWR_OP(x)        ((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP)
+
+#define S_FW_RIWR_SOPEOP       22
+#define M_FW_RIWR_SOPEOP       0x3
+#define V_FW_RIWR_SOPEOP(x)    ((x) << S_FW_RIWR_SOPEOP)
+
+#define S_FW_RIWR_FLAGS                8
+#define M_FW_RIWR_FLAGS                0x3fffff
+#define V_FW_RIWR_FLAGS(x)     ((x) << S_FW_RIWR_FLAGS)
+#define G_FW_RIWR_FLAGS(x)     ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS)
+
+#define S_FW_RIWR_TID          8
+#define V_FW_RIWR_TID(x)       ((x) << S_FW_RIWR_TID)
+
+#define S_FW_RIWR_LEN          0
+#define V_FW_RIWR_LEN(x)       ((x) << S_FW_RIWR_LEN)
+
+#define S_FW_RIWR_GEN           31
+#define V_FW_RIWR_GEN(x)        ((x)  << S_FW_RIWR_GEN)
+
+struct t3_sge {
+       __be32 stag;
+       __be32 len;
+       __be64 to;
+};
+
+/* If num_sgle is zero, flit 5+ contains immediate data.*/
+struct t3_send_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+
+       u8 rdmaop;              /* 2 */
+       u8 reserved[3];
+       __be32 rem_stag;
+       __be32 plen;            /* 3 */
+       __be32 num_sgle;
+       struct t3_sge sgl[T3_MAX_SGE];  /* 4+ */
+};
+
+struct t3_local_inv_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       __be32 stag;            /* 2 */
+       __be32 reserved3;
+};
+
+struct t3_rdma_write_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       u8 rdmaop;              /* 2 */
+       u8 reserved[3];
+       __be32 stag_sink;
+       __be64 to_sink;         /* 3 */
+       __be32 plen;            /* 4 */
+       __be32 num_sgle;
+       struct t3_sge sgl[T3_MAX_SGE];  /* 5+ */
+};
+
+struct t3_rdma_read_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       u8 rdmaop;              /* 2 */
+       u8 reserved[3];
+       __be32 rem_stag;
+       __be64 rem_to;          /* 3 */
+       __be32 local_stag;      /* 4 */
+       __be32 local_len;
+       __be64 local_to;        /* 5 */
+};
+
+enum t3_addr_type {
+       T3_VA_BASED_TO = 0x0,
+       T3_ZERO_BASED_TO = 0x1
+} __attribute__ ((packed));
+
+enum t3_mem_perms {
+       T3_MEM_ACCESS_LOCAL_READ = 0x1,
+       T3_MEM_ACCESS_LOCAL_WRITE = 0x2,
+       T3_MEM_ACCESS_REM_READ = 0x4,
+       T3_MEM_ACCESS_REM_WRITE = 0x8
+} __attribute__ ((packed));
+
+struct t3_bind_mw_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       u16 reserved;           /* 2 */
+       u8 type;
+       u8 perms;
+       __be32 mr_stag;
+       __be32 mw_stag;         /* 3 */
+       __be32 mw_len;
+       __be64 mw_va;           /* 4 */
+       __be32 mr_pbl_addr;     /* 5 */
+       u8 reserved2[3];
+       u8 mr_pagesz;
+};
+
+struct t3_receive_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       u8 pagesz[T3_MAX_SGE];
+       __be32 num_sgle;                /* 2 */
+       struct t3_sge sgl[T3_MAX_SGE];  /* 3+ */
+       __be32 pbl_addr[T3_MAX_SGE];
+};
+
+struct t3_bypass_wr {
+       struct fw_riwrh wrh;
+       union t3_wrid wrid;     /* 1 */
+};
+
+struct t3_modify_qp_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       __be32 flags;           /* 2 */
+       __be32 quiesce;         /* 2 */
+       __be32 max_ird;         /* 3 */
+       __be32 max_ord;         /* 3 */
+       __be64 sge_cmd;         /* 4 */
+       __be64 ctx1;            /* 5 */
+       __be64 ctx0;            /* 6 */
+};
+
+enum t3_modify_qp_flags {
+       MODQP_QUIESCE  = 0x01,
+       MODQP_MAX_IRD  = 0x02,
+       MODQP_MAX_ORD  = 0x04,
+       MODQP_WRITE_EC = 0x08,
+       MODQP_READ_EC  = 0x10,
+};
+
+
+enum t3_mpa_attrs {
+       uP_RI_MPA_RX_MARKER_ENABLE = 0x1,
+       uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
+       uP_RI_MPA_CRC_ENABLE = 0x4,
+       uP_RI_MPA_IETF_ENABLE = 0x8
+} __attribute__ ((packed));
+
+enum t3_qp_caps {
+       uP_RI_QP_RDMA_READ_ENABLE = 0x01,
+       uP_RI_QP_RDMA_WRITE_ENABLE = 0x02,
+       uP_RI_QP_BIND_ENABLE = 0x04,
+       uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
+       uP_RI_QP_STAG0_ENABLE = 0x10
+} __attribute__ ((packed));
+
+struct t3_rdma_init_attr {
+       u32 tid;
+       u32 qpid;
+       u32 pdid;
+       u32 scqid;
+       u32 rcqid;
+       u32 rq_addr;
+       u32 rq_size;
+       enum t3_mpa_attrs mpaattrs;
+       enum t3_qp_caps qpcaps;
+       u16 tcp_emss;
+       u32 ord;
+       u32 ird;
+       u64 qp_dma_addr;
+       u32 qp_dma_size;
+       u32 flags;
+};
+
+struct t3_rdma_init_wr {
+       struct fw_riwrh wrh;    /* 0 */
+       union t3_wrid wrid;     /* 1 */
+       __be32 qpid;            /* 2 */
+       __be32 pdid;
+       __be32 scqid;           /* 3 */
+       __be32 rcqid;
+       __be32 rq_addr;         /* 4 */
+       __be32 rq_size;
+       u8 mpaattrs;            /* 5 */
+       u8 qpcaps;
+       __be16 ulpdu_size;
+       __be32 flags;           /* bits 31-1 - reserved */
+                               /* bit     0 - set if RECV posted */
+       __be32 ord;             /* 6 */
+       __be32 ird;
+       __be64 qp_dma_addr;     /* 7 */
+       __be32 qp_dma_size;     /* 8 */
+       u32 rsvd;
+};
+
+struct t3_genbit {
+       u64 flit[15];
+       __be64 genbit;
+};
+
+enum rdma_init_wr_flags {
+       RECVS_POSTED = 1,
+};
+
+union t3_wr {
+       struct t3_send_wr send;
+       struct t3_rdma_write_wr write;
+       struct t3_rdma_read_wr read;
+       struct t3_receive_wr recv;
+       struct t3_local_inv_wr local_inv;
+       struct t3_bind_mw_wr bind;
+       struct t3_bypass_wr bypass;
+       struct t3_rdma_init_wr init;
+       struct t3_modify_qp_wr qp_mod;
+       struct t3_genbit genbit;
+       u64 flit[16];
+};
+
+#define T3_SQ_CQE_FLIT   13
+#define T3_SQ_COOKIE_FLIT 14
+
+#define T3_RQ_COOKIE_FLIT 13
+#define T3_RQ_CQE_FLIT   14
+
+static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
+{
+       return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
+}
+
+static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
+                                 enum t3_wr_flags flags, u8 genbit, u32 tid,
+                                 u8 len)
+{
+       wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
+                                        V_FW_RIWR_SOPEOP(M_FW_RIWR_SOPEOP) |
+                                        V_FW_RIWR_FLAGS(flags));
+       wmb();
+       wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
+                                      V_FW_RIWR_TID(tid) |
+                                      V_FW_RIWR_LEN(len));
+       /* 2nd gen bit... */
+       ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit);
+}
+
+/*
+ * T3 ULP2_TX commands
+ */
+enum t3_utx_mem_op {
+       T3_UTX_MEM_READ = 2,
+       T3_UTX_MEM_WRITE = 3
+};
+
+/* T3 MC7 RDMA TPT entry format */
+
+enum tpt_mem_type {
+       TPT_NON_SHARED_MR = 0x0,
+       TPT_SHARED_MR = 0x1,
+       TPT_MW = 0x2,
+       TPT_MW_RELAXED_PROTECTION = 0x3
+};
+
+enum tpt_addr_type {
+       TPT_ZBTO = 0,
+       TPT_VATO = 1
+};
+
+enum tpt_mem_perm {
+       TPT_LOCAL_READ = 0x8,
+       TPT_LOCAL_WRITE = 0x4,
+       TPT_REMOTE_READ = 0x2,
+       TPT_REMOTE_WRITE = 0x1
+};
+
+struct tpt_entry {
+       __be32 valid_stag_pdid;
+       __be32 flags_pagesize_qpid;
+
+       __be32 rsvd_pbl_addr;
+       __be32 len;
+       __be32 va_hi;
+       __be32 va_low_or_fbo;
+
+       __be32 rsvd_bind_cnt_or_pstag;
+       __be32 rsvd_pbl_size;
+};
+
+#define S_TPT_VALID            31
+#define V_TPT_VALID(x)         ((x) << S_TPT_VALID)
+#define F_TPT_VALID            V_TPT_VALID(1U)
+
+#define S_TPT_STAG_KEY         23
+#define M_TPT_STAG_KEY         0xFF
+#define V_TPT_STAG_KEY(x)      ((x) << S_TPT_STAG_KEY)
+#define G_TPT_STAG_KEY(x)      (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY)
+
+#define S_TPT_STAG_STATE       22
+#define V_TPT_STAG_STATE(x)    ((x) << S_TPT_STAG_STATE)
+#define F_TPT_STAG_STATE       V_TPT_STAG_STATE(1U)
+
+#define S_TPT_STAG_TYPE                20
+#define M_TPT_STAG_TYPE                0x3
+#define V_TPT_STAG_TYPE(x)     ((x) << S_TPT_STAG_TYPE)
+#define G_TPT_STAG_TYPE(x)     (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE)
+
+#define S_TPT_PDID             0
+#define M_TPT_PDID             0xFFFFF
+#define V_TPT_PDID(x)          ((x) << S_TPT_PDID)
+#define G_TPT_PDID(x)          (((x) >> S_TPT_PDID) & M_TPT_PDID)
+
+#define S_TPT_PERM             28
+#define M_TPT_PERM             0xF
+#define V_TPT_PERM(x)          ((x) << S_TPT_PERM)
+#define G_TPT_PERM(x)          (((x) >> S_TPT_PERM) & M_TPT_PERM)
+
+#define S_TPT_REM_INV_DIS      27
+#define V_TPT_REM_INV_DIS(x)   ((x) << S_TPT_REM_INV_DIS)
+#define F_TPT_REM_INV_DIS      V_TPT_REM_INV_DIS(1U)
+
+#define S_TPT_ADDR_TYPE                26
+#define V_TPT_ADDR_TYPE(x)     ((x) << S_TPT_ADDR_TYPE)
+#define F_TPT_ADDR_TYPE                V_TPT_ADDR_TYPE(1U)
+
+#define S_TPT_MW_BIND_ENABLE   25
+#define V_TPT_MW_BIND_ENABLE(x)        ((x) << S_TPT_MW_BIND_ENABLE)
+#define F_TPT_MW_BIND_ENABLE    V_TPT_MW_BIND_ENABLE(1U)
+
+#define S_TPT_PAGE_SIZE                20
+#define M_TPT_PAGE_SIZE                0x1F
+#define V_TPT_PAGE_SIZE(x)     ((x) << S_TPT_PAGE_SIZE)
+#define G_TPT_PAGE_SIZE(x)     (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE)
+
+#define S_TPT_PBL_ADDR         0
+#define M_TPT_PBL_ADDR         0x1FFFFFFF
+#define V_TPT_PBL_ADDR(x)      ((x) << S_TPT_PBL_ADDR)
+#define G_TPT_PBL_ADDR(x)       (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR)
+
+#define S_TPT_QPID             0
+#define M_TPT_QPID             0xFFFFF
+#define V_TPT_QPID(x)          ((x) << S_TPT_QPID)
+#define G_TPT_QPID(x)          (((x) >> S_TPT_QPID) & M_TPT_QPID)
+
+#define S_TPT_PSTAG            0
+#define M_TPT_PSTAG            0xFFFFFF
+#define V_TPT_PSTAG(x)         ((x) << S_TPT_PSTAG)
+#define G_TPT_PSTAG(x)         (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG)
+
+#define S_TPT_PBL_SIZE         0
+#define M_TPT_PBL_SIZE         0xFFFFF
+#define V_TPT_PBL_SIZE(x)      ((x) << S_TPT_PBL_SIZE)
+#define G_TPT_PBL_SIZE(x)      (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE)
+
+/*
+ * CQE defs
+ */
+struct t3_cqe {
+       __be32 header;
+       __be32 len;
+       union {
+               struct {
+                       __be32 stag;
+                       __be32 msn;
+               } rcqe;
+               struct {
+                       u32 wrid_hi;
+                       u32 wrid_low;
+               } scqe;
+       } u;
+};
+
+#define S_CQE_OOO        31
+#define M_CQE_OOO        0x1
+#define G_CQE_OOO(x)     ((((x) >> S_CQE_OOO)) & M_CQE_OOO)
+#define V_CEQ_OOO(x)     ((x)<<S_CQE_OOO)
+
+#define S_CQE_QPID        12
+#define M_CQE_QPID        0x7FFFF
+#define G_CQE_QPID(x)     ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
+#define V_CQE_QPID(x)    ((x)<<S_CQE_QPID)
+
+#define S_CQE_SWCQE       11
+#define M_CQE_SWCQE       0x1
+#define G_CQE_SWCQE(x)    ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
+#define V_CQE_SWCQE(x)   ((x)<<S_CQE_SWCQE)
+
+#define S_CQE_GENBIT      10
+#define M_CQE_GENBIT      0x1
+#define G_CQE_GENBIT(x)   (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
+#define V_CQE_GENBIT(x)          ((x)<<S_CQE_GENBIT)
+
+#define S_CQE_STATUS      5
+#define M_CQE_STATUS      0x1F
+#define G_CQE_STATUS(x)   ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
+#define V_CQE_STATUS(x)   ((x)<<S_CQE_STATUS)
+
+#define S_CQE_TYPE        4
+#define M_CQE_TYPE        0x1
+#define G_CQE_TYPE(x)     ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
+#define V_CQE_TYPE(x)     ((x)<<S_CQE_TYPE)
+
+#define S_CQE_OPCODE      0
+#define M_CQE_OPCODE      0xF
+#define G_CQE_OPCODE(x)   ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
+#define V_CQE_OPCODE(x)   ((x)<<S_CQE_OPCODE)
+
+#define SW_CQE(x)         (G_CQE_SWCQE(be32_to_cpu((x).header)))
+#define CQE_OOO(x)        (G_CQE_OOO(be32_to_cpu((x).header)))
+#define CQE_QPID(x)       (G_CQE_QPID(be32_to_cpu((x).header)))
+#define CQE_GENBIT(x)     (G_CQE_GENBIT(be32_to_cpu((x).header)))
+#define CQE_TYPE(x)       (G_CQE_TYPE(be32_to_cpu((x).header)))
+#define SQ_TYPE(x)       (CQE_TYPE((x)))
+#define RQ_TYPE(x)       (!CQE_TYPE((x)))
+#define CQE_STATUS(x)     (G_CQE_STATUS(be32_to_cpu((x).header)))
+#define CQE_OPCODE(x)     (G_CQE_OPCODE(be32_to_cpu((x).header)))
+
+#define CQE_LEN(x)        (be32_to_cpu((x).len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x)  (be32_to_cpu((x).u.rcqe.stag))
+#define CQE_WRID_MSN(x)   (be32_to_cpu((x).u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_WPTR(x)    ((x).u.scqe.wrid_hi)
+#define CQE_WRID_WPTR(x)       ((x).u.scqe.wrid_low)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x)         ((x).u.scqe.wrid_hi)
+#define CQE_WRID_LOW(x)                ((x).u.scqe.wrid_low)
+
+#define TPT_ERR_SUCCESS                     0x0
+#define TPT_ERR_STAG                        0x1         /* STAG invalid: either the */
+                                                /* STAG is off limit, being 0, */
+                                                /* or STAG_key mismatch */
+#define TPT_ERR_PDID                        0x2         /* PDID mismatch */
+#define TPT_ERR_QPID                        0x3         /* QPID mismatch */
+#define TPT_ERR_ACCESS                      0x4         /* Invalid access right */
+#define TPT_ERR_WRAP                        0x5         /* Wrap error */
+#define TPT_ERR_BOUND                       0x6         /* base and bounds violation */
+#define TPT_ERR_INVALIDATE_SHARED_MR        0x7         /* attempt to invalidate a  */
+                                                /* shared memory region */
+#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8         /* attempt to invalidate a  */
+                                                /* MR that still has a */
+                                                /* memory window bound to it */
+#define TPT_ERR_ECC                         0x9         /* ECC error detected */
+#define TPT_ERR_ECC_PSTAG                   0xA         /* ECC error detected when  */
+                                                /* reading PSTAG for a MW  */
+                                                /* Invalidate */
+#define TPT_ERR_PBL_ADDR_BOUND              0xB         /* pbl addr out of bounds:  */
+                                                /* software error */
+#define TPT_ERR_SWFLUSH                            0xC  /* SW FLUSHED */
+#define TPT_ERR_CRC                         0x10 /* CRC error */
+#define TPT_ERR_MARKER                      0x11 /* Marker error */
+#define TPT_ERR_PDU_LEN_ERR                 0x12 /* invalid PDU length */
+#define TPT_ERR_OUT_OF_RQE                  0x13 /* out of RQE */
+#define TPT_ERR_DDP_VERSION                 0x14 /* wrong DDP version */
+#define TPT_ERR_RDMA_VERSION                0x15 /* wrong RDMA version */
+#define TPT_ERR_OPCODE                      0x16 /* invalid rdma opcode */
+#define TPT_ERR_DDP_QUEUE_NUM               0x17 /* invalid ddp queue number */
+#define TPT_ERR_MSN                         0x18 /* MSN error */
+#define TPT_ERR_TBIT                        0x19 /* tag bit not set correctly */
+#define TPT_ERR_MO                          0x1A /* MO not 0 for TERMINATE  */
+                                                /* or READ_REQ */
+#define TPT_ERR_MSN_GAP                     0x1B
+#define TPT_ERR_MSN_RANGE                   0x1C
+#define TPT_ERR_IRD_OVERFLOW                0x1D
+#define TPT_ERR_RQE_ADDR_BOUND              0x1E /* RQE addr out of bounds:  */
+                                                /* software error */
+#define TPT_ERR_INTERNAL_ERR                0x1F /* internal error (opcode  */
+                                                /* mismatch) */
+
+struct t3_swsq {
+       __u64                   wr_id;
+       struct t3_cqe           cqe;
+       __u32                   sq_wptr;
+       __be32                  read_len;
+       int                     opcode;
+       int                     complete;
+       int                     signaled;
+};
+
+/*
+ * A T3 WQ implements both the SQ and RQ.
+ */
+struct t3_wq {
+       union t3_wr *queue;             /* DMA accessible memory */
+       dma_addr_t dma_addr;            /* DMA address for HW */
+       DECLARE_PCI_UNMAP_ADDR(mapping) /* unmap cruft */
+       u32 error;                      /* 1 once we go to ERROR */
+       u32 qpid;
+       u32 wptr;                       /* idx to next available WR slot */
+       u32 size_log2;                  /* total wq size */
+       struct t3_swsq *sq;             /* SW SQ */
+       struct t3_swsq *oldest_read;    /* tracks oldest pending read */
+       u32 sq_wptr;                    /* sq_wptr - sq_rptr == count of */
+       u32 sq_rptr;                    /* pending wrs */
+       u32 sq_size_log2;               /* sq size */
+       u64 *rq;                        /* SW RQ (holds consumer wr_ids) */
+       u32 rq_wptr;                    /* rq_wptr - rq_rptr == count of */
+       u32 rq_rptr;                    /* pending wrs */
+       u64 *rq_oldest_wr;              /* oldest wr on the SW RQ */
+       u32 rq_size_log2;               /* rq size */
+       u32 rq_addr;                    /* rq adapter address */
+       void __iomem *doorbell;         /* kernel db */
+       u64 udb;                        /* user db if any */
+};
+
+struct t3_cq {
+       u32 cqid;
+       u32 rptr;
+       u32 wptr;
+       u32 size_log2;
+       dma_addr_t dma_addr;
+       DECLARE_PCI_UNMAP_ADDR(mapping)
+       struct t3_cqe *queue;
+       struct t3_cqe *sw_queue;
+       u32 sw_rptr;
+       u32 sw_wptr;
+};
+
+#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \
+                                        CQE_GENBIT(*cqe))
+
+static inline void cxio_set_wq_in_error(struct t3_wq *wq)
+{
+       wq->queue->flit[13] = 1;
+}
+
+static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
+{
+       struct t3_cqe *cqe;
+
+       cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+       if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+               return cqe;
+       return NULL;
+}
+
+static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq)
+{
+       struct t3_cqe *cqe;
+
+       if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+               cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+               return cqe;
+       }
+       return NULL;
+}
+
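+/*
+ * SW CQEs (out-of-order completions stashed by the poll logic) are
+ * drained before the HW CQ.
+ */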
+static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq)
+{
+       struct t3_cqe *cqe;
+
+       if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+               cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+               return cqe;
+       }
+       cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+       if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+               return cqe;
+       return NULL;
+}
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
new file mode 100644 (file)
index 0000000..4611afa
--- /dev/null
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+#include "iwch_user.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+
+#define DRV_VERSION "1.1"
+
+MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
+MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
+
+static void open_rnic_dev(struct t3cdev *);
+static void close_rnic_dev(struct t3cdev *);
+
+struct cxgb3_client t3c_client = {
+       .name = "iw_cxgb3",
+       .add = open_rnic_dev,
+       .remove = close_rnic_dev,
+       .handlers = t3c_handlers,
+       .redirect = iwch_ep_redirect
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(dev_mutex);
+
+static void rnic_init(struct iwch_dev *rnicp)
+{
+       PDBG("%s iwch_dev %p\n", __FUNCTION__,  rnicp);
+       idr_init(&rnicp->cqidr);
+       idr_init(&rnicp->qpidr);
+       idr_init(&rnicp->mmidr);
+       spin_lock_init(&rnicp->lock);
+
+       rnicp->attr.vendor_id = 0x168;
+       rnicp->attr.vendor_part_id = 7;
+       rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
+       rnicp->attr.max_wrs = (1UL << 24) - 1;
+       rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
+       rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
+       rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
+       rnicp->attr.max_cqes_per_cq = (1UL << 24) - 1;
+       rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
+       rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
+       rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
+       rnicp->attr.mem_pgsizes_bitmask = 0x7FFF;       /* 4KB-128MB */
+       rnicp->attr.can_resize_wq = 0;
+       rnicp->attr.max_rdma_reads_per_qp = 8;
+       rnicp->attr.max_rdma_read_resources =
+           rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps;
+       rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */
+       rnicp->attr.max_rdma_read_depth =
+           rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps;
+       rnicp->attr.rq_overflow_handled = 0;
+       rnicp->attr.can_modify_ird = 0;
+       rnicp->attr.can_modify_ord = 0;
+       rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1;
+       rnicp->attr.stag0_value = 1;
+       rnicp->attr.zbva_support = 1;
+       rnicp->attr.local_invalidate_fence = 1;
+       rnicp->attr.cq_overflow_detection = 1;
+       return;
+}
+
+static void open_rnic_dev(struct t3cdev *tdev)
+{
+       struct iwch_dev *rnicp;
+       static int vers_printed;
+
+       PDBG("%s t3cdev %p\n", __FUNCTION__,  tdev);
+       if (!vers_printed++)
+               printk(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n",
+                      DRV_VERSION);
+       rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+       if (!rnicp) {
+               printk(KERN_ERR MOD "Cannot allocate ib device\n");
+               return;
+       }
+       rnicp->rdev.ulp = rnicp;
+       rnicp->rdev.t3cdev_p = tdev;
+
+       mutex_lock(&dev_mutex);
+
+       if (cxio_rdev_open(&rnicp->rdev)) {
+               mutex_unlock(&dev_mutex);
+               printk(KERN_ERR MOD "Unable to open CXIO rdev\n");
+               ib_dealloc_device(&rnicp->ibdev);
+               return;
+       }
+
+       rnic_init(rnicp);
+
+       list_add_tail(&rnicp->entry, &dev_list);
+       mutex_unlock(&dev_mutex);
+
+       if (iwch_register_device(rnicp)) {
+               printk(KERN_ERR MOD "Unable to register device\n");
+               close_rnic_dev(tdev);
+               return;
+       }
+       printk(KERN_INFO MOD "Initialized device %s\n",
+              pci_name(rnicp->rdev.rnic_info.pdev));
+       return;
+}
+
+static void close_rnic_dev(struct t3cdev *tdev)
+{
+       struct iwch_dev *dev, *tmp;
+       PDBG("%s t3cdev %p\n", __FUNCTION__,  tdev);
+       mutex_lock(&dev_mutex);
+       list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
+               if (dev->rdev.t3cdev_p == tdev) {
+                       list_del(&dev->entry);
+                       iwch_unregister_device(dev);
+                       cxio_rdev_close(&dev->rdev);
+                       idr_destroy(&dev->cqidr);
+                       idr_destroy(&dev->qpidr);
+                       idr_destroy(&dev->mmidr);
+                       ib_dealloc_device(&dev->ibdev);
+                       break;
+               }
+       }
+       mutex_unlock(&dev_mutex);
+}
+
+static int __init iwch_init_module(void)
+{
+       int err;
+
+       err = cxio_hal_init();
+       if (err)
+               return err;
+       err = iwch_cm_init();
+       if (err) {
+               cxio_hal_exit();
+               return err;
+       }
+       cxio_register_ev_cb(iwch_ev_dispatch);
+       cxgb3_register_client(&t3c_client);
+       return 0;
+}
+
+static void __exit iwch_exit_module(void)
+{
+       cxgb3_unregister_client(&t3c_client);
+       cxio_unregister_ev_cb(iwch_ev_dispatch);
+       iwch_cm_term();
+       cxio_hal_exit();
+}
+
+module_init(iwch_init_module);
+module_exit(iwch_exit_module);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
new file mode 100644 (file)
index 0000000..6517ef8
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_H__
+#define __IWCH_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+
+struct iwch_pd;
+struct iwch_cq;
+struct iwch_qp;
+struct iwch_mr;
+
+struct iwch_rnic_attributes {
+       u32 vendor_id;
+       u32 vendor_part_id;
+       u32 max_qps;
+       u32 max_wrs;                            /* Max for any SQ/RQ */
+       u32 max_sge_per_wr;
+       u32 max_sge_per_rdma_write_wr;  /* for RDMA Write WR */
+       u32 max_cqs;
+       u32 max_cqes_per_cq;
+       u32 max_mem_regs;
+       u32 max_phys_buf_entries;               /* for phys buf list */
+       u32 max_pds;
+
+       /*
+        * The memory page sizes supported by this RNIC.
+        * Bit position i in bitmap indicates page of
+        * size (4k)^i.  Phys block list mode unsupported.
+        */
+       u32 mem_pgsizes_bitmask;
+       u8 can_resize_wq;
+
+       /*
+        * The maximum number of RDMA Reads that can be outstanding
+        * per QP with this RNIC as the target.
+        */
+       u32 max_rdma_reads_per_qp;
+
+       /*
+        * The maximum number of resources this RNIC uses for
+        * RDMA Reads that target it.
+        */
+       u32 max_rdma_read_resources;
+
+       /*
+        * The max depth per QP for initiation of RDMA Read
+        * by this RNIC.
+        */
+       u32 max_rdma_read_qp_depth;
+
+       /*
+        * The maximum depth for initiation of RDMA Read
+        * operations by this RNIC on all QPs
+        */
+       u32 max_rdma_read_depth;
+       u8 rq_overflow_handled;
+       u32 can_modify_ird;
+       u32 can_modify_ord;
+       u32 max_mem_windows;
+       u32 stag0_value;
+       u8 zbva_support;
+       u8 local_invalidate_fence;
+       u32 cq_overflow_detection;
+};
+
+struct iwch_dev {
+       struct ib_device ibdev;
+       struct cxio_rdev rdev;
+       u32 device_cap_flags;
+       struct iwch_rnic_attributes attr;
+       struct idr cqidr;
+       struct idr qpidr;
+       struct idr mmidr;
+       spinlock_t lock;
+       struct list_head entry;
+};
+
+static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct iwch_dev, ibdev);
+}
+
+static inline int t3b_device(const struct iwch_dev *rhp)
+{
+       return rhp->rdev.t3cdev_p->type == T3B;
+}
+
+static inline int t3a_device(const struct iwch_dev *rhp)
+{
+       return rhp->rdev.t3cdev_p->type == T3A;
+}
+
+static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
+{
+       return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
+{
+       return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
+{
+       return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
+                               void *handle, u32 id)
+{
+       int ret;
+       u32 newid;
+
+       do {
+               if (!idr_pre_get(idr, GFP_KERNEL))
+                       return -ENOMEM;
+               spin_lock_irq(&rhp->lock);
+               ret = idr_get_new_above(idr, handle, id, &newid);
+               BUG_ON(newid != id);
+               spin_unlock_irq(&rhp->lock);
+       } while (ret == -EAGAIN);
+
+       return ret;
+}
+
+static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
+{
+       spin_lock_irq(&rhp->lock);
+       idr_remove(idr, id);
+       spin_unlock_irq(&rhp->lock);
+}
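+
+/*
+ * Illustrative usage only: a CQ is made discoverable by its cqid via
+ * insert_handle() and can then be looked up from event context with
+ * get_chp(); remove_handle() undoes this at destroy time.  The helper
+ * name below is hypothetical.
+ */
+static inline int example_register_cq(struct iwch_dev *rhp,
+                                      struct iwch_cq *chp, u32 cqid)
+{
+       /* afterwards get_chp(rhp, cqid) returns chp until remove_handle() */
+       return insert_handle(rhp, &rhp->cqidr, chp, cqid);
+}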
+
+extern struct cxgb3_client t3c_client;
+extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
+extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb);
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
new file mode 100644 (file)
index 0000000..a522b1b
--- /dev/null
@@ -0,0 +1,2081 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+
+#include "tcb.h"
+#include "cxgb3_offload.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+
+static char *states[] = {
+       "idle",
+       "listen",
+       "connecting",
+       "mpa_wait_req",
+       "mpa_req_sent",
+       "mpa_req_rcvd",
+       "mpa_rep_sent",
+       "fpdu_mode",
+       "aborting",
+       "closing",
+       "moribund",
+       "dead",
+       NULL,
+};
+
+static int ep_timeout_secs = 10;
+module_param(ep_timeout_secs, int, 0444);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+                                  "in seconds (default=10)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0444);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+                "1 is spec compliant. (default=1)");
+
+static int markers_enabled = 0;
+module_param(markers_enabled, int, 0444);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0444);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0444);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 32 * 1024;
+module_param(snd_win, int, 0444);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
+
+static unsigned int nocong = 0;
+module_param(nocong, uint, 0444);
+MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)");
+
+static unsigned int cong_flavor = 1;
+module_param(cong_flavor, uint, 0444);
+MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
+
+static void process_work(struct work_struct *work);
+static struct workqueue_struct *workq;
+static DECLARE_WORK(skb_work, process_work);
+
+static struct sk_buff_head rxq;
+static cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS];
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct iwch_ep *ep, int status);
+
+static void start_ep_timer(struct iwch_ep *ep)
+{
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       if (timer_pending(&ep->timer)) {
+               PDBG("%s stopped / restarted timer ep %p\n", __FUNCTION__, ep);
+               del_timer_sync(&ep->timer);
+       } else
+               get_ep(&ep->com);
+       ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+       ep->timer.data = (unsigned long)ep;
+       ep->timer.function = ep_timeout;
+       add_timer(&ep->timer);
+}
+
+static void stop_ep_timer(struct iwch_ep *ep)
+{
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       del_timer_sync(&ep->timer);
+       put_ep(&ep->com);
+}
+
+static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
+{
+       struct cpl_tid_release *req;
+
+       skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+       if (!skb)
+               return;
+       req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+       skb->priority = CPL_PRIORITY_SETUP;
+       tdev->send(tdev, skb);
+       return;
+}
+
+int iwch_quiesce_tid(struct iwch_ep *ep)
+{
+       struct cpl_set_tcb_field *req;
+       struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+       if (!skb)
+               return -ENOMEM;
+       req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+       req->reply = 0;
+       req->cpu_idx = 0;
+       req->word = htons(W_TCB_RX_QUIESCE);
+       req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+       req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);
+
+       skb->priority = CPL_PRIORITY_DATA;
+       ep->com.tdev->send(ep->com.tdev, skb);
+       return 0;
+}
+
+int iwch_resume_tid(struct iwch_ep *ep)
+{
+       struct cpl_set_tcb_field *req;
+       struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+       if (!skb)
+               return -ENOMEM;
+       req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+       req->reply = 0;
+       req->cpu_idx = 0;
+       req->word = htons(W_TCB_RX_QUIESCE);
+       req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+       req->val = 0;
+
+       skb->priority = CPL_PRIORITY_DATA;
+       ep->com.tdev->send(ep->com.tdev, skb);
+       return 0;
+}
+
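+/*
+ * Worked example for set_emss() below (illustrative): with an MTU
+ * table entry of 1500 bytes, emss = 1500 - 40 = 1460; when TCP
+ * timestamps are negotiated a further 12 bytes of option space is
+ * subtracted, giving 1448, and the result is clamped to at least 128.
+ */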
+static void set_emss(struct iwch_ep *ep, u16 opt)
+{
+       PDBG("%s ep %p opt %u\n", __FUNCTION__, ep, opt);
+       ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40;
+       if (G_TCPOPT_TSTAMP(opt))
+               ep->emss -= 12;
+       if (ep->emss < 128)
+               ep->emss = 128;
+       PDBG("emss=%d\n", ep->emss);
+}
+
+static enum iwch_ep_state state_read(struct iwch_ep_common *epc)
+{
+       unsigned long flags;
+       enum iwch_ep_state state;
+
+       spin_lock_irqsave(&epc->lock, flags);
+       state = epc->state;
+       spin_unlock_irqrestore(&epc->lock, flags);
+       return state;
+}
+
+static inline void __state_set(struct iwch_ep_common *epc,
+                              enum iwch_ep_state new)
+{
+       epc->state = new;
+}
+
+static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&epc->lock, flags);
+       PDBG("%s - %s -> %s\n", __FUNCTION__, states[epc->state], states[new]);
+       __state_set(epc, new);
+       spin_unlock_irqrestore(&epc->lock, flags);
+       return;
+}
+
+static void *alloc_ep(int size, gfp_t gfp)
+{
+       struct iwch_ep_common *epc;
+
+       epc = kzalloc(size, gfp);
+       if (epc) {
+               kref_init(&epc->kref);
+               spin_lock_init(&epc->lock);
+               init_waitqueue_head(&epc->waitq);
+       }
+       PDBG("%s alloc ep %p\n", __FUNCTION__, epc);
+       return epc;
+}
+
+void __free_ep(struct kref *kref)
+{
+       struct iwch_ep_common *epc;
+       epc = container_of(kref, struct iwch_ep_common, kref);
+       PDBG("%s ep %p state %s\n", __FUNCTION__, epc, states[state_read(epc)]);
+       kfree(epc);
+}
+
+static void release_ep_resources(struct iwch_ep *ep)
+{
+       PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid);
+       cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid);
+       dst_release(ep->dst);
+       l2t_release(L2DATA(ep->com.tdev), ep->l2t);
+       if (ep->com.tdev->type == T3B)
+               release_tid(ep->com.tdev, ep->hwtid, NULL);
+       put_ep(&ep->com);
+}
+
+static void process_work(struct work_struct *work)
+{
+       struct sk_buff *skb = NULL;
+       void *ep;
+       struct t3cdev *tdev;
+       int ret;
+
+       while ((skb = skb_dequeue(&rxq))) {
+               ep = *((void **) (skb->cb));
+               tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
+               ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
+               if (ret & CPL_RET_BUF_DONE)
+                       kfree_skb(skb);
+
+               /*
+                * ep was referenced in sched(), and is freed here.
+                */
+               put_ep((struct iwch_ep_common *)ep);
+       }
+}
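+
+/*
+ * Sketch of the deferral pattern process_work() drains (illustrative;
+ * the driver's real sched() handler appears later in this file and is
+ * assumed to do roughly this): take a reference on the endpoint, stash
+ * the context and t3cdev in skb->cb, queue the skb and kick the
+ * workqueue.
+ */
+static inline int example_sched(struct t3cdev *tdev, struct sk_buff *skb,
+                               void *ctx)
+{
+       get_ep((struct iwch_ep_common *) ctx);  /* dropped in process_work() */
+       *((void **) skb->cb) = ctx;
+       *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev;
+       skb_queue_tail(&rxq, skb);
+       queue_work(workq, &skb_work);
+       return 0;
+}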
+
+static int status2errno(int status)
+{
+       switch (status) {
+       case CPL_ERR_NONE:
+               return 0;
+       case CPL_ERR_CONN_RESET:
+               return -ECONNRESET;
+       case CPL_ERR_ARP_MISS:
+               return -EHOSTUNREACH;
+       case CPL_ERR_CONN_TIMEDOUT:
+               return -ETIMEDOUT;
+       case CPL_ERR_TCAM_FULL:
+               return -ENOMEM;
+       case CPL_ERR_CONN_EXIST:
+               return -EADDRINUSE;
+       default:
+               return -EIO;
+       }
+}
+
+/*
+ * Try and reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+       if (skb) {
+               BUG_ON(skb_cloned(skb));
+               skb_trim(skb, 0);
+               skb_get(skb);
+       } else {
+               skb = alloc_skb(len, gfp);
+       }
+       return skb;
+}
+
+static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip,
+                                __be32 peer_ip, __be16 local_port,
+                                __be16 peer_port, u8 tos)
+{
+       struct rtable *rt;
+       struct flowi fl = {
+               .oif = 0,
+               .nl_u = {
+                        .ip4_u = {
+                                  .daddr = peer_ip,
+                                  .saddr = local_ip,
+                                  .tos = tos}
+                        },
+               .proto = IPPROTO_TCP,
+               .uli_u = {
+                         .ports = {
+                                   .sport = local_port,
+                                   .dport = peer_port}
+                         }
+       };
+
+       if (ip_route_output_flow(&rt, &fl, NULL, 0))
+               return NULL;
+       return rt;
+}
+
+static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+       int i = 0;
+
+       while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+               ++i;
+       return i;
+}
+
+static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb)
+{
+       PDBG("%s t3cdev %p\n", __FUNCTION__, dev);
+       kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+       printk(KERN_ERR MOD "ARP failure during connect\n");
+       kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+       struct cpl_abort_req *req = cplhdr(skb);
+
+       PDBG("%s t3cdev %p\n", __FUNCTION__, dev);
+       req->cmd = CPL_ABORT_NO_RST;
+       cxgb3_ofld_send(dev, skb);
+}
+
+static int send_halfclose(struct iwch_ep *ep, gfp_t gfp)
+{
+       struct cpl_close_con_req *req;
+       struct sk_buff *skb;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       skb = get_skb(NULL, sizeof(*req), gfp);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       skb->priority = CPL_PRIORITY_DATA;
+       set_arp_failure_handler(skb, arp_failure_discard);
+       req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+       req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid));
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       return 0;
+}
+
+static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+       struct cpl_abort_req *req;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       skb = get_skb(skb, sizeof(*req), gfp);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+                      __FUNCTION__);
+               return -ENOMEM;
+       }
+       skb->priority = CPL_PRIORITY_DATA;
+       set_arp_failure_handler(skb, abort_arp_failure);
+       req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+       req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+       req->cmd = CPL_ABORT_SEND_RST;
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       return 0;
+}
+
+static int send_connect(struct iwch_ep *ep)
+{
+       struct cpl_act_open_req *req;
+       struct sk_buff *skb;
+       u32 opt0h, opt0l, opt2;
+       unsigned int mtu_idx;
+       int wscale;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+
+       skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+                      __FUNCTION__);
+               return -ENOMEM;
+       }
+       mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+       wscale = compute_wscale(rcv_win);
+       opt0h = V_NAGLE(0) |
+           V_NO_CONG(nocong) |
+           V_KEEP_ALIVE(1) |
+           F_TCAM_BYPASS |
+           V_WND_SCALE(wscale) |
+           V_MSS_IDX(mtu_idx) |
+           V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+       opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+       opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor);
+       skb->priority = CPL_PRIORITY_SETUP;
+       set_arp_failure_handler(skb, act_open_req_arp_failure);
+
+       req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid));
+       req->local_port = ep->com.local_addr.sin_port;
+       req->peer_port = ep->com.remote_addr.sin_port;
+       req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+       req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
+       req->opt0h = htonl(opt0h);
+       req->opt0l = htonl(opt0l);
+       req->params = 0;
+       req->opt2 = htonl(opt2);
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       return 0;
+}
+
+static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb)
+{
+       int mpalen;
+       struct tx_data_wr *req;
+       struct mpa_message *mpa;
+       int len;
+
+       PDBG("%s ep %p pd_len %d\n", __FUNCTION__, ep, ep->plen);
+
+       BUG_ON(skb_cloned(skb));
+
+       mpalen = sizeof(*mpa) + ep->plen;
+       if (skb->data + mpalen + sizeof(*req) > skb->end) {
+               kfree_skb(skb);
+               skb = alloc_skb(mpalen + sizeof(*req), GFP_KERNEL);
+               if (!skb) {
+                       connect_reply_upcall(ep, -ENOMEM);
+                       return;
+               }
+       }
+       skb_trim(skb, 0);
+       skb_reserve(skb, sizeof(*req));
+       skb_put(skb, mpalen);
+       skb->priority = CPL_PRIORITY_DATA;
+       mpa = (struct mpa_message *) skb->data;
+       memset(mpa, 0, sizeof(*mpa));
+       memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+       mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+                    (markers_enabled ? MPA_MARKERS : 0);
+       mpa->private_data_size = htons(ep->plen);
+       mpa->revision = mpa_rev;
+
+       if (ep->plen)
+               memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+       /*
+        * Reference the mpa skb.  This ensures the data area
+        * will remain in memory until the hw acks the tx.
+        * Function tx_ack() will deref it.
+        */
+       skb_get(skb);
+       set_arp_failure_handler(skb, arp_failure_discard);
+       skb->h.raw = skb->data;
+       len = skb->len;
+       req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+       req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+       req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+       req->len = htonl(len);
+       req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+                          V_TX_SNDBUF(snd_win>>15));
+       req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT);
+       req->sndseq = htonl(ep->snd_seq);
+       BUG_ON(ep->mpa_skb);
+       ep->mpa_skb = skb;
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       start_ep_timer(ep);
+       state_set(&ep->com, MPA_REQ_SENT);
+       return;
+}
+
+static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+       int mpalen;
+       struct tx_data_wr *req;
+       struct mpa_message *mpa;
+       struct sk_buff *skb;
+
+       PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen);
+
+       mpalen = sizeof(*mpa) + plen;
+
+       skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       skb_reserve(skb, sizeof(*req));
+       mpa = (struct mpa_message *) skb_put(skb, mpalen);
+       memset(mpa, 0, sizeof(*mpa));
+       memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+       mpa->flags = MPA_REJECT;
+       mpa->revision = mpa_rev;
+       mpa->private_data_size = htons(plen);
+       if (plen)
+               memcpy(mpa->private_data, pdata, plen);
+
+       /*
+        * Reference the mpa skb again.  This ensures the data area
+        * will remain in memory until the hw acks the tx.
+        * Function tx_ack() will deref it.
+        */
+       skb_get(skb);
+       skb->priority = CPL_PRIORITY_DATA;
+       set_arp_failure_handler(skb, arp_failure_discard);
+       skb->h.raw = skb->data;
+       req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+       req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+       req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+       req->len = htonl(mpalen);
+       req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+                          V_TX_SNDBUF(snd_win>>15));
+       req->flags = htonl(F_TX_IMM_ACK|F_TX_INIT);
+       req->sndseq = htonl(ep->snd_seq);
+       BUG_ON(ep->mpa_skb);
+       ep->mpa_skb = skb;
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       return 0;
+}
+
+static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+       int mpalen;
+       struct tx_data_wr *req;
+       struct mpa_message *mpa;
+       int len;
+       struct sk_buff *skb;
+
+       PDBG("%s ep %p plen %d\n", __FUNCTION__, ep, plen);
+
+       mpalen = sizeof(*mpa) + plen;
+
+       skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       skb->priority = CPL_PRIORITY_DATA;
+       skb_reserve(skb, sizeof(*req));
+       mpa = (struct mpa_message *) skb_put(skb, mpalen);
+       memset(mpa, 0, sizeof(*mpa));
+       memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+       mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+                    (markers_enabled ? MPA_MARKERS : 0);
+       mpa->revision = mpa_rev;
+       mpa->private_data_size = htons(plen);
+       if (plen)
+               memcpy(mpa->private_data, pdata, plen);
+
+       /*
+        * Reference the mpa skb.  This ensures the data area
+        * will remain in memory until the hw acks the tx.
+        * Function tx_ack() will deref it.
+        */
+       skb_get(skb);
+       set_arp_failure_handler(skb, arp_failure_discard);
+       skb->h.raw = skb->data;
+       len = skb->len;
+       req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+       req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+       req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+       req->len = htonl(len);
+       req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+                          V_TX_SNDBUF(snd_win>>15));
+       req->flags = htonl(F_TX_MORE | F_TX_IMM_ACK | F_TX_INIT);
+       req->sndseq = htonl(ep->snd_seq);
+       ep->mpa_skb = skb;
+       state_set(&ep->com, MPA_REP_SENT);
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+       return 0;
+}
+
+static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct cpl_act_establish *req = cplhdr(skb);
+       unsigned int tid = GET_TID(req);
+
+       PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, tid);
+
+       dst_confirm(ep->dst);
+
+       /* setup the hwtid for this connection */
+       ep->hwtid = tid;
+       cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid);
+
+       ep->snd_seq = ntohl(req->snd_isn);
+
+       set_emss(ep, ntohs(req->tcp_opt));
+
+       /* dealloc the atid */
+       cxgb3_free_atid(ep->com.tdev, ep->atid);
+
+       /* start MPA negotiation */
+       send_mpa_req(ep, skb);
+
+       return 0;
+}
+
+static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       state_set(&ep->com, ABORTING);
+       send_abort(ep, skb, gfp);
+}
+
+static void close_complete_upcall(struct iwch_ep *ep)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_CLOSE;
+       if (ep->com.cm_id) {
+               PDBG("close complete delivered ep %p cm_id %p tid %d\n",
+                    ep, ep->com.cm_id, ep->hwtid);
+               ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+               ep->com.cm_id->rem_ref(ep->com.cm_id);
+               ep->com.cm_id = NULL;
+               ep->com.qp = NULL;
+       }
+}
+
+static void peer_close_upcall(struct iwch_ep *ep)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_DISCONNECT;
+       if (ep->com.cm_id) {
+               PDBG("peer close delivered ep %p cm_id %p tid %d\n",
+                    ep, ep->com.cm_id, ep->hwtid);
+               ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+       }
+}
+
+static void peer_abort_upcall(struct iwch_ep *ep)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_CLOSE;
+       event.status = -ECONNRESET;
+       if (ep->com.cm_id) {
+               PDBG("abort delivered ep %p cm_id %p tid %d\n", ep,
+                    ep->com.cm_id, ep->hwtid);
+               ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+               ep->com.cm_id->rem_ref(ep->com.cm_id);
+               ep->com.cm_id = NULL;
+               ep->com.qp = NULL;
+       }
+}
+
+static void connect_reply_upcall(struct iwch_ep *ep, int status)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p status %d\n", __FUNCTION__, ep, status);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_CONNECT_REPLY;
+       event.status = status;
+       event.local_addr = ep->com.local_addr;
+       event.remote_addr = ep->com.remote_addr;
+
+       if ((status == 0) || (status == -ECONNREFUSED)) {
+               event.private_data_len = ep->plen;
+               event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+       }
+       if (ep->com.cm_id) {
+               PDBG("%s ep %p tid %d status %d\n", __FUNCTION__, ep,
+                    ep->hwtid, status);
+               ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+       }
+       if (status < 0 && ep->com.cm_id) {
+               ep->com.cm_id->rem_ref(ep->com.cm_id);
+               ep->com.cm_id = NULL;
+               ep->com.qp = NULL;
+       }
+}
+
+static void connect_request_upcall(struct iwch_ep *ep)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_CONNECT_REQUEST;
+       event.local_addr = ep->com.local_addr;
+       event.remote_addr = ep->com.remote_addr;
+       event.private_data_len = ep->plen;
+       event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+       event.provider_data = ep;
+       if (state_read(&ep->parent_ep->com) != DEAD)
+               ep->parent_ep->com.cm_id->event_handler(
+                                               ep->parent_ep->com.cm_id,
+                                               &event);
+       put_ep(&ep->parent_ep->com);
+       ep->parent_ep = NULL;
+}
+
+static void established_upcall(struct iwch_ep *ep)
+{
+       struct iw_cm_event event;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       memset(&event, 0, sizeof(event));
+       event.event = IW_CM_EVENT_ESTABLISHED;
+       if (ep->com.cm_id) {
+               PDBG("%s ep %p tid %d\n", __FUNCTION__, ep, ep->hwtid);
+               ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+       }
+}
+
+static int update_rx_credits(struct iwch_ep *ep, u32 credits)
+{
+       struct cpl_rx_data_ack *req;
+       struct sk_buff *skb;
+
+       PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits);
+       skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+               return 0;
+       }
+
+       req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid));
+       req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1));
+       skb->priority = CPL_PRIORITY_ACK;
+       ep->com.tdev->send(ep->com.tdev, skb);
+       return credits;
+}
+
+static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
+{
+       struct mpa_message *mpa;
+       u16 plen;
+       struct iwch_qp_attributes attrs;
+       enum iwch_qp_attr_mask mask;
+       int err;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+
+       /*
+        * Stop mpa timer.  If it expired, then the state has
+        * changed and we bail since ep_timeout already aborted
+        * the connection.
+        */
+       stop_ep_timer(ep);
+       if (state_read(&ep->com) != MPA_REQ_SENT)
+               return;
+
+       /*
+        * If we get more than the supported amount of private data
+        * then we must fail this connection.
+        */
+       if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+               err = -EINVAL;
+               goto err;
+       }
+
+       /*
+        * copy the new data into our accumulation buffer.
+        */
+       memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len);
+       ep->mpa_pkt_len += skb->len;
+
+       /*
+        * if we don't even have the mpa message, then bail.
+        */
+       if (ep->mpa_pkt_len < sizeof(*mpa))
+               return;
+       mpa = (struct mpa_message *) ep->mpa_pkt;
+
+       /* Validate MPA header. */
+       if (mpa->revision != mpa_rev) {
+               err = -EPROTO;
+               goto err;
+       }
+       if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+               err = -EPROTO;
+               goto err;
+       }
+
+       plen = ntohs(mpa->private_data_size);
+
+       /*
+        * Fail if there's too much private data.
+        */
+       if (plen > MPA_MAX_PRIVATE_DATA) {
+               err = -EPROTO;
+               goto err;
+       }
+
+       /*
+        * If plen does not account for pkt size
+        */
+       if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+               err = -EPROTO;
+               goto err;
+       }
+
+       ep->plen = (u8) plen;
+
+       /*
+        * If we don't have all the pdata yet, then bail.
+        * We'll continue process when more data arrives.
+        */
+       if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+               return;
+
+       if (mpa->flags & MPA_REJECT) {
+               err = -ECONNREFUSED;
+               goto err;
+       }
+
+       /*
+        * If we get here we have accumulated the entire mpa
+        * start reply message including private data. And
+        * the MPA header is valid.
+        */
+       state_set(&ep->com, FPDU_MODE);
+       ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+       ep->mpa_attr.recv_marker_enabled = markers_enabled;
+       ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+       ep->mpa_attr.version = mpa_rev;
+       PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+            "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__,
+            ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+            ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+       attrs.mpa_attr = ep->mpa_attr;
+       attrs.max_ird = ep->ird;
+       attrs.max_ord = ep->ord;
+       attrs.llp_stream_handle = ep;
+       attrs.next_state = IWCH_QP_STATE_RTS;
+
+       mask = IWCH_QP_ATTR_NEXT_STATE |
+           IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
+           IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;
+
+       /* bind QP and TID with INIT_WR */
+       err = iwch_modify_qp(ep->com.qp->rhp,
+                            ep->com.qp, mask, &attrs, 1);
+       if (!err)
+               goto out;
+err:
+       abort_connection(ep, skb, GFP_KERNEL);
+out:
+       connect_reply_upcall(ep, err);
+       return;
+}
+
+static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb)
+{
+       struct mpa_message *mpa;
+       u16 plen;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+
+       /*
+        * Stop mpa timer.  If it expired, then the state has
+        * changed and we bail since ep_timeout already aborted
+        * the connection.
+        */
+       stop_ep_timer(ep);
+       if (state_read(&ep->com) != MPA_REQ_WAIT)
+               return;
+
+       /*
+        * If we get more than the supported amount of private data
+        * then we must fail this connection.
+        */
+       if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+               abort_connection(ep, skb, GFP_KERNEL);
+               return;
+       }
+
+       PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__);
+
+       /*
+        * Copy the new data into our accumulation buffer.
+        */
+       memcpy(&(ep->mpa_pkt[ep->mpa_pkt_len]), skb->data, skb->len);
+       ep->mpa_pkt_len += skb->len;
+
+       /*
+        * If we don't even have the mpa message, then bail.
+        * We'll continue process when more data arrives.
+        */
+       if (ep->mpa_pkt_len < sizeof(*mpa))
+               return;
+       PDBG("%s enter (%s line %u)\n", __FUNCTION__, __FILE__, __LINE__);
+       mpa = (struct mpa_message *) ep->mpa_pkt;
+
+       /*
+        * Validate MPA Header.
+        */
+       if (mpa->revision != mpa_rev) {
+               abort_connection(ep, skb, GFP_KERNEL);
+               return;
+       }
+
+       if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+               abort_connection(ep, skb, GFP_KERNEL);
+               return;
+       }
+
+       plen = ntohs(mpa->private_data_size);
+
+       /*
+        * Fail if there's too much private data.
+        */
+       if (plen > MPA_MAX_PRIVATE_DATA) {
+               abort_connection(ep, skb, GFP_KERNEL);
+               return;
+       }
+
+       /*
+        * If plen does not account for pkt size
+        */
+       if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+               abort_connection(ep, skb, GFP_KERNEL);
+               return;
+       }
+       ep->plen = (u8) plen;
+
+       /*
+        * If we don't have all the pdata yet, then bail.
+        */
+       if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+               return;
+
+       /*
+        * If we get here we have accumulated the entire mpa
+        * start reply message including private data.
+        */
+       ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+       ep->mpa_attr.recv_marker_enabled = markers_enabled;
+       ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+       ep->mpa_attr.version = mpa_rev;
+       PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+            "xmit_marker_enabled=%d, version=%d\n", __FUNCTION__,
+            ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+            ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+       state_set(&ep->com, MPA_REQ_RCVD);
+
+       /* drive upcall */
+       connect_request_upcall(ep);
+       return;
+}
+
+static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct cpl_rx_data *hdr = cplhdr(skb);
+       unsigned int dlen = ntohs(hdr->len);
+
+       PDBG("%s ep %p dlen %u\n", __FUNCTION__, ep, dlen);
+
+       skb_pull(skb, sizeof(*hdr));
+       skb_trim(skb, dlen);
+
+       switch (state_read(&ep->com)) {
+       case MPA_REQ_SENT:
+               process_mpa_reply(ep, skb);
+               break;
+       case MPA_REQ_WAIT:
+               process_mpa_request(ep, skb);
+               break;
+       case MPA_REP_SENT:
+               break;
+       default:
+               printk(KERN_ERR MOD "%s Unexpected streaming data."
+                      " ep %p state %d tid %d\n",
+                      __FUNCTION__, ep, state_read(&ep->com), ep->hwtid);
+
+               /*
+                * The ep will timeout and inform the ULP of the failure.
+                * See ep_timeout().
+                */
+               break;
+       }
+
+       /* update RX credits */
+       update_rx_credits(ep, dlen);
+
+       return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us its just the single MPA request or reply.  We can now free
+ * the skb holding the mpa message.
+ */
+static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct cpl_wr_ack *hdr = cplhdr(skb);
+       unsigned int credits = ntohs(hdr->credits);
+       enum iwch_qp_attr_mask  mask;
+
+       PDBG("%s ep %p credits %u\n", __FUNCTION__, ep, credits);
+
+       if (credits == 0)
+               return CPL_RET_BUF_DONE;
+       BUG_ON(credits != 1);
+       BUG_ON(ep->mpa_skb == NULL);
+       kfree_skb(ep->mpa_skb);
+       ep->mpa_skb = NULL;
+       dst_confirm(ep->dst);
+       if (state_read(&ep->com) == MPA_REP_SENT) {
+               struct iwch_qp_attributes attrs;
+
+               /* bind QP to EP and move to RTS */
+               attrs.mpa_attr = ep->mpa_attr;
+               attrs.max_ird = ep->ord;
+               attrs.max_ord = ep->ord;
+               attrs.llp_stream_handle = ep;
+               attrs.next_state = IWCH_QP_STATE_RTS;
+
+               /* bind QP and TID with INIT_WR */
+               mask = IWCH_QP_ATTR_NEXT_STATE |
+                                    IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+                                    IWCH_QP_ATTR_MPA_ATTR |
+                                    IWCH_QP_ATTR_MAX_IRD |
+                                    IWCH_QP_ATTR_MAX_ORD;
+
+               ep->com.rpl_err = iwch_modify_qp(ep->com.qp->rhp,
+                                    ep->com.qp, mask, &attrs, 1);
+
+               if (!ep->com.rpl_err) {
+                       state_set(&ep->com, FPDU_MODE);
+                       established_upcall(ep);
+               }
+
+               ep->com.rpl_done = 1;
+               PDBG("waking up ep %p\n", ep);
+               wake_up(&ep->com.waitq);
+       }
+       return CPL_RET_BUF_DONE;
+}
+
+static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+
+       close_complete_upcall(ep);
+       state_set(&ep->com, DEAD);
+       release_ep_resources(ep);
+       return CPL_RET_BUF_DONE;
+}
+
+static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+       PDBG("%s ep %p status %u errno %d\n", __FUNCTION__, ep, rpl->status,
+            status2errno(rpl->status));
+       connect_reply_upcall(ep, status2errno(rpl->status));
+       state_set(&ep->com, DEAD);
+       if (ep->com.tdev->type == T3B)
+               release_tid(ep->com.tdev, GET_TID(rpl), NULL);
+       cxgb3_free_atid(ep->com.tdev, ep->atid);
+       dst_release(ep->dst);
+       l2t_release(L2DATA(ep->com.tdev), ep->l2t);
+       put_ep(&ep->com);
+       return CPL_RET_BUF_DONE;
+}
+
+static int listen_start(struct iwch_listen_ep *ep)
+{
+       struct sk_buff *skb;
+       struct cpl_pass_open_req *req;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - failed to alloc skb!\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+
+       req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid));
+       req->local_port = ep->com.local_addr.sin_port;
+       req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+       req->peer_port = 0;
+       req->peer_ip = 0;
+       req->peer_netmask = 0;
+       req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
+       req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10));
+       req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+
+       skb->priority = 1;
+       ep->com.tdev->send(ep->com.tdev, skb);
+       return 0;
+}
+
+static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_listen_ep *ep = ctx;
+       struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+
+       PDBG("%s ep %p status %d error %d\n", __FUNCTION__, ep,
+            rpl->status, status2errno(rpl->status));
+       ep->com.rpl_err = status2errno(rpl->status);
+       ep->com.rpl_done = 1;
+       wake_up(&ep->com.waitq);
+
+       return CPL_RET_BUF_DONE;
+}
+
+static int listen_stop(struct iwch_listen_ep *ep)
+{
+       struct sk_buff *skb;
+       struct cpl_close_listserv_req *req;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+       if (!skb) {
+               printk(KERN_ERR MOD "%s - failed to alloc skb\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req));
+       req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));
+       skb->priority = 1;
+       ep->com.tdev->send(ep->com.tdev, skb);
+       return 0;
+}
+
+static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb,
+                            void *ctx)
+{
+       struct iwch_listen_ep *ep = ctx;
+       struct cpl_close_listserv_rpl *rpl = cplhdr(skb);
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       ep->com.rpl_err = status2errno(rpl->status);
+       ep->com.rpl_done = 1;
+       wake_up(&ep->com.waitq);
+       return CPL_RET_BUF_DONE;
+}
+
+static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb)
+{
+       struct cpl_pass_accept_rpl *rpl;
+       unsigned int mtu_idx;
+       u32 opt0h, opt0l, opt2;
+       int wscale;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       BUG_ON(skb_cloned(skb));
+       skb_trim(skb, sizeof(*rpl));
+       skb_get(skb);
+       mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+       wscale = compute_wscale(rcv_win);
+       opt0h = V_NAGLE(0) |
+           V_NO_CONG(nocong) |
+           V_KEEP_ALIVE(1) |
+           F_TCAM_BYPASS |
+           V_WND_SCALE(wscale) |
+           V_MSS_IDX(mtu_idx) |
+           V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+       opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+       opt2 = V_FLAVORS_VALID(1) | V_CONG_CONTROL_FLAVOR(cong_flavor);
+
+       rpl = cplhdr(skb);
+       rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+       OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid));
+       rpl->peer_ip = peer_ip;
+       rpl->opt0h = htonl(opt0h);
+       rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT);
+       rpl->opt2 = htonl(opt2);
+       rpl->rsvd = rpl->opt2;  /* workaround for HW bug */
+       skb->priority = CPL_PRIORITY_SETUP;
+       l2t_send(ep->com.tdev, skb, ep->l2t);
+
+       return;
+}
+
+static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip,
+                     struct sk_buff *skb)
+{
+       PDBG("%s t3cdev %p tid %u peer_ip %x\n", __FUNCTION__, tdev, hwtid,
+            peer_ip);
+       BUG_ON(skb_cloned(skb));
+       skb_trim(skb, sizeof(struct cpl_tid_release));
+       skb_get(skb);
+
+       if (tdev->type == T3B)
+               release_tid(tdev, hwtid, skb);
+       else {
+               struct cpl_pass_accept_rpl *rpl;
+
+               rpl = cplhdr(skb);
+               skb->priority = CPL_PRIORITY_SETUP;
+               rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+               OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+                                                     hwtid));
+               rpl->peer_ip = peer_ip;
+               rpl->opt0h = htonl(F_TCAM_BYPASS);
+               rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+               rpl->opt2 = 0;
+               rpl->rsvd = rpl->opt2;
+               tdev->send(tdev, skb);
+       }
+}
+
+static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *child_ep, *parent_ep = ctx;
+       struct cpl_pass_accept_req *req = cplhdr(skb);
+       unsigned int hwtid = GET_TID(req);
+       struct dst_entry *dst;
+       struct l2t_entry *l2t;
+       struct rtable *rt;
+       struct iff_mac tim;
+
+       PDBG("%s parent ep %p tid %u\n", __FUNCTION__, parent_ep, hwtid);
+
+       if (state_read(&parent_ep->com) != LISTEN) {
+               printk(KERN_ERR MOD "%s - listening ep not in LISTEN\n",
+                      __FUNCTION__);
+               goto reject;
+       }
+
+       /*
+        * Find the netdev for this connection request.
+        */
+       tim.mac_addr = req->dst_mac;
+       tim.vlan_tag = ntohs(req->vlan_tag);
+       if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
+               printk(KERN_ERR MOD
+                       "%s bad dst mac %02x %02x %02x %02x %02x %02x\n",
+                       __FUNCTION__,
+                       req->dst_mac[0],
+                       req->dst_mac[1],
+                       req->dst_mac[2],
+                       req->dst_mac[3],
+                       req->dst_mac[4],
+                       req->dst_mac[5]);
+               goto reject;
+       }
+
+       /* Find output route */
+       rt = find_route(tdev,
+                       req->local_ip,
+                       req->peer_ip,
+                       req->local_port,
+                       req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid)));
+       if (!rt) {
+               printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+                      __FUNCTION__);
+               goto reject;
+       }
+       dst = &rt->u.dst;
+       l2t = t3_l2t_get(tdev, dst->neighbour, dst->neighbour->dev);
+       if (!l2t) {
+               printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+                      __FUNCTION__);
+               dst_release(dst);
+               goto reject;
+       }
+       child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+       if (!child_ep) {
+               printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+                      __FUNCTION__);
+               l2t_release(L2DATA(tdev), l2t);
+               dst_release(dst);
+               goto reject;
+       }
+       state_set(&child_ep->com, CONNECTING);
+       child_ep->com.tdev = tdev;
+       child_ep->com.cm_id = NULL;
+       child_ep->com.local_addr.sin_family = PF_INET;
+       child_ep->com.local_addr.sin_port = req->local_port;
+       child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
+       child_ep->com.remote_addr.sin_family = PF_INET;
+       child_ep->com.remote_addr.sin_port = req->peer_port;
+       child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
+       get_ep(&parent_ep->com);
+       child_ep->parent_ep = parent_ep;
+       child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid));
+       child_ep->l2t = l2t;
+       child_ep->dst = dst;
+       child_ep->hwtid = hwtid;
+       init_timer(&child_ep->timer);
+       cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid);
+       accept_cr(child_ep, req->peer_ip, skb);
+       goto out;
+reject:
+       reject_cr(tdev, hwtid, req->peer_ip, skb);
+out:
+       return CPL_RET_BUF_DONE;
+}
+
+static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct cpl_pass_establish *req = cplhdr(skb);
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       ep->snd_seq = ntohl(req->snd_isn);
+
+       set_emss(ep, ntohs(req->tcp_opt));
+
+       dst_confirm(ep->dst);
+       state_set(&ep->com, MPA_REQ_WAIT);
+       start_ep_timer(ep);
+
+       return CPL_RET_BUF_DONE;
+}
+
+static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct iwch_qp_attributes attrs;
+       unsigned long flags;
+       int disconnect = 1;
+       int release = 0;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       dst_confirm(ep->dst);
+
+       spin_lock_irqsave(&ep->com.lock, flags);
+       switch (ep->com.state) {
+       case MPA_REQ_WAIT:
+               __state_set(&ep->com, CLOSING);
+               break;
+       case MPA_REQ_SENT:
+               __state_set(&ep->com, CLOSING);
+               connect_reply_upcall(ep, -ECONNRESET);
+               break;
+       case MPA_REQ_RCVD:
+
+               /*
+                * We're gonna mark this puppy DEAD, but keep
+                * the reference on it until the ULP accepts or
+                * rejects the CR.
+                */
+               __state_set(&ep->com, CLOSING);
+               get_ep(&ep->com);
+               break;
+       case MPA_REP_SENT:
+               __state_set(&ep->com, CLOSING);
+               ep->com.rpl_done = 1;
+               ep->com.rpl_err = -ECONNRESET;
+               PDBG("waking up ep %p\n", ep);
+               wake_up(&ep->com.waitq);
+               break;
+       case FPDU_MODE:
+               __state_set(&ep->com, CLOSING);
+               attrs.next_state = IWCH_QP_STATE_CLOSING;
+               iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+                              IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+               peer_close_upcall(ep);
+               break;
+       case ABORTING:
+               disconnect = 0;
+               break;
+       case CLOSING:
+               start_ep_timer(ep);
+               __state_set(&ep->com, MORIBUND);
+               disconnect = 0;
+               break;
+       case MORIBUND:
+               stop_ep_timer(ep);
+               if (ep->com.cm_id && ep->com.qp) {
+                       attrs.next_state = IWCH_QP_STATE_IDLE;
+                       iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+                                      IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+               }
+               close_complete_upcall(ep);
+               __state_set(&ep->com, DEAD);
+               release = 1;
+               disconnect = 0;
+               break;
+       case DEAD:
+               disconnect = 0;
+               break;
+       default:
+               BUG_ON(1);
+       }
+       spin_unlock_irqrestore(&ep->com.lock, flags);
+       if (disconnect)
+               iwch_ep_disconnect(ep, 0, GFP_KERNEL);
+       if (release)
+               release_ep_resources(ep);
+       return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static inline int is_neg_adv_abort(unsigned int status)
+{
+       return status == CPL_ERR_RTX_NEG_ADVICE ||
+              status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct cpl_abort_req_rss *req = cplhdr(skb);
+       struct iwch_ep *ep = ctx;
+       struct cpl_abort_rpl *rpl;
+       struct sk_buff *rpl_skb;
+       struct iwch_qp_attributes attrs;
+       int ret;
+       int state;
+
+       if (is_neg_adv_abort(req->status)) {
+               PDBG("%s neg_adv_abort ep %p tid %d\n", __FUNCTION__, ep,
+                    ep->hwtid);
+               t3_l2t_send_event(ep->com.tdev, ep->l2t);
+               return CPL_RET_BUF_DONE;
+       }
+
+       state = state_read(&ep->com);
+       PDBG("%s ep %p state %u\n", __FUNCTION__, ep, state);
+       switch (state) {
+       case CONNECTING:
+               break;
+       case MPA_REQ_WAIT:
+               break;
+       case MPA_REQ_SENT:
+               connect_reply_upcall(ep, -ECONNRESET);
+               break;
+       case MPA_REP_SENT:
+               ep->com.rpl_done = 1;
+               ep->com.rpl_err = -ECONNRESET;
+               PDBG("waking up ep %p\n", ep);
+               wake_up(&ep->com.waitq);
+               break;
+       case MPA_REQ_RCVD:
+
+               /*
+                * We're gonna mark this puppy DEAD, but keep
+                * the reference on it until the ULP accepts or
+                * rejects the CR.
+                */
+               get_ep(&ep->com);
+               break;
+       case MORIBUND:
+               stop_ep_timer(ep);
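+               /* fall through */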
+       case FPDU_MODE:
+       case CLOSING:
+               if (ep->com.cm_id && ep->com.qp) {
+                       attrs.next_state = IWCH_QP_STATE_ERROR;
+                       ret = iwch_modify_qp(ep->com.qp->rhp,
+                                    ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+                                    &attrs, 1);
+                       if (ret)
+                               printk(KERN_ERR MOD
+                                      "%s - qp <- error failed!\n",
+                                      __FUNCTION__);
+               }
+               peer_abort_upcall(ep);
+               break;
+       case ABORTING:
+               break;
+       case DEAD:
+               PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __FUNCTION__);
+               return CPL_RET_BUF_DONE;
+       default:
+               BUG_ON(1);
+               break;
+       }
+       dst_confirm(ep->dst);
+
+       rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+       if (!rpl_skb) {
+               printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+                      __FUNCTION__);
+               dst_release(ep->dst);
+               l2t_release(L2DATA(ep->com.tdev), ep->l2t);
+               put_ep(&ep->com);
+               return CPL_RET_BUF_DONE;
+       }
+       rpl_skb->priority = CPL_PRIORITY_DATA;
+       rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+       rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+       rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+       OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+       rpl->cmd = CPL_ABORT_NO_RST;
+       ep->com.tdev->send(ep->com.tdev, rpl_skb);
+       if (state != ABORTING) {
+               state_set(&ep->com, DEAD);
+               release_ep_resources(ep);
+       }
+       return CPL_RET_BUF_DONE;
+}
+
+static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+       struct iwch_qp_attributes attrs;
+       unsigned long flags;
+       int release = 0;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       BUG_ON(!ep);
+
+       /* The cm_id may be null if we failed to connect */
+       spin_lock_irqsave(&ep->com.lock, flags);
+       switch (ep->com.state) {
+       case CLOSING:
+               start_ep_timer(ep);
+               __state_set(&ep->com, MORIBUND);
+               break;
+       case MORIBUND:
+               stop_ep_timer(ep);
+               if ((ep->com.cm_id) && (ep->com.qp)) {
+                       attrs.next_state = IWCH_QP_STATE_IDLE;
+                       iwch_modify_qp(ep->com.qp->rhp,
+                                            ep->com.qp,
+                                            IWCH_QP_ATTR_NEXT_STATE,
+                                            &attrs, 1);
+               }
+               close_complete_upcall(ep);
+               __state_set(&ep->com, DEAD);
+               release = 1;
+               break;
+       case DEAD:
+       default:
+               BUG_ON(1);
+               break;
+       }
+       spin_unlock_irqrestore(&ep->com.lock, flags);
+       if (release)
+               release_ep_resources(ep);
+       return CPL_RET_BUF_DONE;
+}
+
+/*
+ * T3A does 3 things when a TERM is received:
+ * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
+ * 2) generate an async event on the QP with the TERMINATE opcode
+ * 3) post a TERMINATE opcode CQE into the associated CQ.
+ *
+ * For (1), we save the message in the QP for the consumer to pick up later.
+ * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
+ * For (3), we toss the CQE in cxio_poll_cq().
+ *
+ * terminate() handles case (1)...
+ */
+static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep *ep = ctx;
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       skb_pull(skb, sizeof(struct cpl_rdma_terminate));
+       PDBG("%s saving %d bytes of term msg\n", __FUNCTION__, skb->len);
+       memcpy(ep->com.qp->attr.terminate_buffer, skb->data, skb->len);
+       ep->com.qp->attr.terminate_msg_len = skb->len;
+       ep->com.qp->attr.is_terminate_local = 0;
+       return CPL_RET_BUF_DONE;
+}
+
+static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct cpl_rdma_ec_status *rep = cplhdr(skb);
+       struct iwch_ep *ep = ctx;
+
+       PDBG("%s ep %p tid %u status %d\n", __FUNCTION__, ep, ep->hwtid,
+            rep->status);
+       if (rep->status) {
+               struct iwch_qp_attributes attrs;
+
+               printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n",
+                      __FUNCTION__, ep->hwtid);
+               attrs.next_state = IWCH_QP_STATE_ERROR;
+               iwch_modify_qp(ep->com.qp->rhp,
+                              ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+                              &attrs, 1);
+               abort_connection(ep, NULL, GFP_KERNEL);
+       }
+       return CPL_RET_BUF_DONE;
+}
+
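+/*
+ * The endpoint timer fires when an MPA exchange or a close does not complete
+ * in time.  It runs in timer (softirq) context, hence GFP_ATOMIC below.
+ */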
+static void ep_timeout(unsigned long arg)
+{
+       struct iwch_ep *ep = (struct iwch_ep *)arg;
+       struct iwch_qp_attributes attrs;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ep->com.lock, flags);
+       PDBG("%s ep %p tid %u state %d\n", __FUNCTION__, ep, ep->hwtid,
+            ep->com.state);
+       switch (ep->com.state) {
+       case MPA_REQ_SENT:
+               connect_reply_upcall(ep, -ETIMEDOUT);
+               break;
+       case MPA_REQ_WAIT:
+               break;
+       case MORIBUND:
+               if (ep->com.cm_id && ep->com.qp) {
+                       attrs.next_state = IWCH_QP_STATE_ERROR;
+                       iwch_modify_qp(ep->com.qp->rhp,
+                                    ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+                                    &attrs, 1);
+               }
+               break;
+       default:
+               BUG();
+       }
+       __state_set(&ep->com, CLOSING);
+       spin_unlock_irqrestore(&ep->com.lock, flags);
+       abort_connection(ep, NULL, GFP_ATOMIC);
+       put_ep(&ep->com);
+}
+
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+       int err;
+       struct iwch_ep *ep = to_ep(cm_id);
+       PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid);
+
+       if (state_read(&ep->com) == DEAD) {
+               put_ep(&ep->com);
+               return -ECONNRESET;
+       }
+       BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+       state_set(&ep->com, CLOSING);
+       if (mpa_rev == 0)
+               abort_connection(ep, NULL, GFP_KERNEL);
+       else {
+               err = send_mpa_reject(ep, pdata, pdata_len);
+               err = send_halfclose(ep, GFP_KERNEL);
+       }
+       return 0;
+}
+
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+       int err;
+       struct iwch_qp_attributes attrs;
+       enum iwch_qp_attr_mask mask;
+       struct iwch_ep *ep = to_ep(cm_id);
+       struct iwch_dev *h = to_iwch_dev(cm_id->device);
+       struct iwch_qp *qp = get_qhp(h, conn_param->qpn);
+
+       PDBG("%s ep %p tid %u\n", __FUNCTION__, ep, ep->hwtid);
+       if (state_read(&ep->com) == DEAD) {
+               put_ep(&ep->com);
+               return -ECONNRESET;
+       }
+
+       BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+       BUG_ON(!qp);
+
+       if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
+           (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
+               abort_connection(ep, NULL, GFP_KERNEL);
+               return -EINVAL;
+       }
+
+       cm_id->add_ref(cm_id);
+       ep->com.cm_id = cm_id;
+       ep->com.qp = qp;
+
+       ep->com.rpl_done = 0;
+       ep->com.rpl_err = 0;
+       ep->ird = conn_param->ird;
+       ep->ord = conn_param->ord;
+       PDBG("%s %d ird %d ord %d\n", __FUNCTION__, __LINE__, ep->ird, ep->ord);
+       get_ep(&ep->com);
+       err = send_mpa_reply(ep, conn_param->private_data,
+                            conn_param->private_data_len);
+       if (err) {
+               ep->com.cm_id = NULL;
+               ep->com.qp = NULL;
+               cm_id->rem_ref(cm_id);
+               abort_connection(ep, NULL, GFP_KERNEL);
+               put_ep(&ep->com);
+               return err;
+       }
+
+       /* bind QP to EP and move to RTS */
+       attrs.mpa_attr = ep->mpa_attr;
+       attrs.max_ird = ep->ird;
+       attrs.max_ord = ep->ord;
+       attrs.llp_stream_handle = ep;
+       attrs.next_state = IWCH_QP_STATE_RTS;
+
+       /* bind QP and TID with INIT_WR */
+       mask = IWCH_QP_ATTR_NEXT_STATE |
+                            IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+                            IWCH_QP_ATTR_MPA_ATTR |
+                            IWCH_QP_ATTR_MAX_IRD |
+                            IWCH_QP_ATTR_MAX_ORD;
+
+       err = iwch_modify_qp(ep->com.qp->rhp,
+                            ep->com.qp, mask, &attrs, 1);
+
+       if (err) {
+               ep->com.cm_id = NULL;
+               ep->com.qp = NULL;
+               cm_id->rem_ref(cm_id);
+               abort_connection(ep, NULL, GFP_KERNEL);
+       } else {
+               state_set(&ep->com, FPDU_MODE);
+               established_upcall(ep);
+       }
+       put_ep(&ep->com);
+       return err;
+}
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+       int err = 0;
+       struct iwch_dev *h = to_iwch_dev(cm_id->device);
+       struct iwch_ep *ep;
+       struct rtable *rt;
+
+       ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+       if (!ep) {
+               printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto out;
+       }
+       init_timer(&ep->timer);
+       ep->plen = conn_param->private_data_len;
+       if (ep->plen)
+               memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+                      conn_param->private_data, ep->plen);
+       ep->ird = conn_param->ird;
+       ep->ord = conn_param->ord;
+       ep->com.tdev = h->rdev.t3cdev_p;
+
+       cm_id->add_ref(cm_id);
+       ep->com.cm_id = cm_id;
+       ep->com.qp = get_qhp(h, conn_param->qpn);
+       BUG_ON(!ep->com.qp);
+       PDBG("%s qpn 0x%x qp %p cm_id %p\n", __FUNCTION__, conn_param->qpn,
+            ep->com.qp, cm_id);
+
+       /*
+        * Allocate an active TID to initiate a TCP connection.
+        */
+       ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep);
+       if (ep->atid == -1) {
+               printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto fail2;
+       }
+
+       /* find a route */
+       rt = find_route(h->rdev.t3cdev_p,
+                       cm_id->local_addr.sin_addr.s_addr,
+                       cm_id->remote_addr.sin_addr.s_addr,
+                       cm_id->local_addr.sin_port,
+                       cm_id->remote_addr.sin_port, IPTOS_LOWDELAY);
+       if (!rt) {
+               printk(KERN_ERR MOD "%s - cannot find route.\n", __FUNCTION__);
+               err = -EHOSTUNREACH;
+               goto fail3;
+       }
+       ep->dst = &rt->u.dst;
+
+       /* get a l2t entry */
+       ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst->neighbour,
+                            ep->dst->neighbour->dev);
+       if (!ep->l2t) {
+               printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto fail4;
+       }
+
+       state_set(&ep->com, CONNECTING);
+       ep->tos = IPTOS_LOWDELAY;
+       ep->com.local_addr = cm_id->local_addr;
+       ep->com.remote_addr = cm_id->remote_addr;
+
+       /* send connect request to rnic */
+       err = send_connect(ep);
+       if (!err)
+               goto out;
+
+       l2t_release(L2DATA(h->rdev.t3cdev_p), ep->l2t);
+fail4:
+       dst_release(ep->dst);
+fail3:
+       cxgb3_free_atid(ep->com.tdev, ep->atid);
+fail2:
+       put_ep(&ep->com);
+out:
+       return err;
+}
+
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+       int err = 0;
+       struct iwch_dev *h = to_iwch_dev(cm_id->device);
+       struct iwch_listen_ep *ep;
+
+
+       might_sleep();
+
+       ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+       if (!ep) {
+               printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto fail1;
+       }
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+       ep->com.tdev = h->rdev.t3cdev_p;
+       cm_id->add_ref(cm_id);
+       ep->com.cm_id = cm_id;
+       ep->backlog = backlog;
+       ep->com.local_addr = cm_id->local_addr;
+
+       /*
+        * Allocate a server TID.
+        */
+       ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep);
+       if (ep->stid == -1) {
+               printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __FUNCTION__);
+               err = -ENOMEM;
+               goto fail2;
+       }
+
+       state_set(&ep->com, LISTEN);
+       err = listen_start(ep);
+       if (err)
+               goto fail3;
+
+       /* wait for pass_open_rpl */
+       wait_event(ep->com.waitq, ep->com.rpl_done);
+       err = ep->com.rpl_err;
+       if (!err) {
+               cm_id->provider_data = ep;
+               goto out;
+       }
+fail3:
+       cxgb3_free_stid(ep->com.tdev, ep->stid);
+fail2:
+       put_ep(&ep->com);
+fail1:
+out:
+       return err;
+}
+
+int iwch_destroy_listen(struct iw_cm_id *cm_id)
+{
+       int err;
+       struct iwch_listen_ep *ep = to_listen_ep(cm_id);
+
+       PDBG("%s ep %p\n", __FUNCTION__, ep);
+
+       might_sleep();
+       state_set(&ep->com, DEAD);
+       ep->com.rpl_done = 0;
+       ep->com.rpl_err = 0;
+       err = listen_stop(ep);
+       wait_event(ep->com.waitq, ep->com.rpl_done);
+       cxgb3_free_stid(ep->com.tdev, ep->stid);
+       err = ep->com.rpl_err;
+       cm_id->rem_ref(cm_id);
+       put_ep(&ep->com);
+       return err;
+}
+
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp)
+{
+       int ret = 0;
+       unsigned long flags;
+       int close = 0;
+
+       spin_lock_irqsave(&ep->com.lock, flags);
+
+       PDBG("%s ep %p state %s, abrupt %d\n", __FUNCTION__, ep,
+            states[ep->com.state], abrupt);
+
+       if (ep->com.state == DEAD) {
+               PDBG("%s already dead ep %p\n", __FUNCTION__, ep);
+               goto out;
+       }
+
+       if (abrupt) {
+               if (ep->com.state != ABORTING) {
+                       ep->com.state = ABORTING;
+                       close = 1;
+               }
+               goto out;
+       }
+
+       switch (ep->com.state) {
+       case MPA_REQ_WAIT:
+       case MPA_REQ_SENT:
+       case MPA_REQ_RCVD:
+       case MPA_REP_SENT:
+       case FPDU_MODE:
+               ep->com.state = CLOSING;
+               close = 1;
+               break;
+       case CLOSING:
+               start_ep_timer(ep);
+               ep->com.state = MORIBUND;
+               close = 1;
+               break;
+       case MORIBUND:
+               break;
+       default:
+               BUG();
+               break;
+       }
+out:
+       spin_unlock_irqrestore(&ep->com.lock, flags);
+       if (close) {
+               if (abrupt)
+                       ret = send_abort(ep, NULL, gfp);
+               else
+                       ret = send_halfclose(ep, gfp);
+       }
+       return ret;
+}
+
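+/*
+ * If this endpoint is using the old dst entry, switch it over to the new
+ * dst and l2t entry and report the redirect to the caller.
+ */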
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+                    struct l2t_entry *l2t)
+{
+       struct iwch_ep *ep = ctx;
+
+       if (ep->dst != old)
+               return 0;
+
+       PDBG("%s ep %p redirect to dst %p l2t %p\n", __FUNCTION__, ep, new,
+            l2t);
+       dst_hold(new);
+       l2t_release(L2DATA(ep->com.tdev), ep->l2t);
+       ep->l2t = l2t;
+       dst_release(old);
+       ep->dst = new;
+       return 1;
+}
+
+/*
+ * All the CM events are handled on a work queue to have a safe context.
+ */
+static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+       struct iwch_ep_common *epc = ctx;
+
+       get_ep(epc);
+
+       /*
+        * Save ctx and tdev in the skb->cb area.
+        */
+       *((void **) skb->cb) = ctx;
+       *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev;
+
+       /*
+        * Queue the skb and schedule the worker thread.
+        */
+       skb_queue_tail(&rxq, skb);
+       queue_work(workq, &skb_work);
+       return 0;
+}
+
+int __init iwch_cm_init(void)
+{
+       skb_queue_head_init(&rxq);
+
+       workq = create_singlethread_workqueue("iw_cxgb3");
+       if (!workq)
+               return -ENOMEM;
+
+       /*
+        * All upcalls from the T3 Core go to sched() to
+        * schedule the processing on a work queue.
+        */
+       t3c_handlers[CPL_ACT_ESTABLISH] = sched;
+       t3c_handlers[CPL_ACT_OPEN_RPL] = sched;
+       t3c_handlers[CPL_RX_DATA] = sched;
+       t3c_handlers[CPL_TX_DMA_ACK] = sched;
+       t3c_handlers[CPL_ABORT_RPL_RSS] = sched;
+       t3c_handlers[CPL_ABORT_RPL] = sched;
+       t3c_handlers[CPL_PASS_OPEN_RPL] = sched;
+       t3c_handlers[CPL_CLOSE_LISTSRV_RPL] = sched;
+       t3c_handlers[CPL_PASS_ACCEPT_REQ] = sched;
+       t3c_handlers[CPL_PASS_ESTABLISH] = sched;
+       t3c_handlers[CPL_PEER_CLOSE] = sched;
+       t3c_handlers[CPL_CLOSE_CON_RPL] = sched;
+       t3c_handlers[CPL_ABORT_REQ_RSS] = sched;
+       t3c_handlers[CPL_RDMA_TERMINATE] = sched;
+       t3c_handlers[CPL_RDMA_EC_STATUS] = sched;
+
+       /*
+        * These are the real handlers that are called from a
+        * work queue.
+        */
+       work_handlers[CPL_ACT_ESTABLISH] = act_establish;
+       work_handlers[CPL_ACT_OPEN_RPL] = act_open_rpl;
+       work_handlers[CPL_RX_DATA] = rx_data;
+       work_handlers[CPL_TX_DMA_ACK] = tx_ack;
+       work_handlers[CPL_ABORT_RPL_RSS] = abort_rpl;
+       work_handlers[CPL_ABORT_RPL] = abort_rpl;
+       work_handlers[CPL_PASS_OPEN_RPL] = pass_open_rpl;
+       work_handlers[CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl;
+       work_handlers[CPL_PASS_ACCEPT_REQ] = pass_accept_req;
+       work_handlers[CPL_PASS_ESTABLISH] = pass_establish;
+       work_handlers[CPL_PEER_CLOSE] = peer_close;
+       work_handlers[CPL_ABORT_REQ_RSS] = peer_abort;
+       work_handlers[CPL_CLOSE_CON_RPL] = close_con_rpl;
+       work_handlers[CPL_RDMA_TERMINATE] = terminate;
+       work_handlers[CPL_RDMA_EC_STATUS] = ec_status;
+       return 0;
+}
+
+void __exit iwch_cm_term(void)
+{
+       flush_workqueue(workq);
+       destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
new file mode 100644 (file)
index 0000000..7c810d9
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWCH_CM_H_
+#define _IWCH_CM_H_
+
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA   256
+#define MPA_REV                0       /* XXX - amso1100 uses rev 0 ! */
+#define MPA_REJECT             0x20
+#define MPA_CRC                        0x40
+#define MPA_MARKERS            0x80
+#define MPA_FLAGS_MASK         0xE0
+
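+/*
+ * Endpoint reference counting helpers: __free_ep() runs once the last
+ * reference is dropped.
+ */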
+#define put_ep(ep) { \
+       PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __FUNCTION__, __LINE__,  \
+            ep, atomic_read(&((ep)->kref.refcount))); \
+       kref_put(&((ep)->kref), __free_ep); \
+}
+
+#define get_ep(ep) { \
+       PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __FUNCTION__, __LINE__, \
+            ep, atomic_read(&((ep)->kref.refcount))); \
+       kref_get(&((ep)->kref));  \
+}
+
+struct mpa_message {
+       u8 key[16];
+       u8 flags;
+       u8 revision;
+       __be16 private_data_size;
+       u8 private_data[0];
+};
+
+struct terminate_message {
+       u8 layer_etype;
+       u8 ecode;
+       __be16 hdrct_rsvd;
+       u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum iwch_layers_types {
+       LAYER_RDMAP             = 0x00,
+       LAYER_DDP               = 0x10,
+       LAYER_MPA               = 0x20,
+       RDMAP_LOCAL_CATA        = 0x00,
+       RDMAP_REMOTE_PROT       = 0x01,
+       RDMAP_REMOTE_OP         = 0x02,
+       DDP_LOCAL_CATA          = 0x00,
+       DDP_TAGGED_ERR          = 0x01,
+       DDP_UNTAGGED_ERR        = 0x02,
+       DDP_LLP                 = 0x03
+};
+
+enum iwch_rdma_ecodes {
+       RDMAP_INV_STAG          = 0x00,
+       RDMAP_BASE_BOUNDS       = 0x01,
+       RDMAP_ACC_VIOL          = 0x02,
+       RDMAP_STAG_NOT_ASSOC    = 0x03,
+       RDMAP_TO_WRAP           = 0x04,
+       RDMAP_INV_VERS          = 0x05,
+       RDMAP_INV_OPCODE        = 0x06,
+       RDMAP_STREAM_CATA       = 0x07,
+       RDMAP_GLOBAL_CATA       = 0x08,
+       RDMAP_CANT_INV_STAG     = 0x09,
+       RDMAP_UNSPECIFIED       = 0xff
+};
+
+enum iwch_ddp_ecodes {
+       DDPT_INV_STAG           = 0x00,
+       DDPT_BASE_BOUNDS        = 0x01,
+       DDPT_STAG_NOT_ASSOC     = 0x02,
+       DDPT_TO_WRAP            = 0x03,
+       DDPT_INV_VERS           = 0x04,
+       DDPU_INV_QN             = 0x01,
+       DDPU_INV_MSN_NOBUF      = 0x02,
+       DDPU_INV_MSN_RANGE      = 0x03,
+       DDPU_INV_MO             = 0x04,
+       DDPU_MSG_TOOBIG         = 0x05,
+       DDPU_INV_VERS           = 0x06
+};
+
+enum iwch_mpa_ecodes {
+       MPA_CRC_ERR             = 0x02,
+       MPA_MARKER_ERR          = 0x03
+};
+
+enum iwch_ep_state {
+       IDLE = 0,
+       LISTEN,
+       CONNECTING,
+       MPA_REQ_WAIT,
+       MPA_REQ_SENT,
+       MPA_REQ_RCVD,
+       MPA_REP_SENT,
+       FPDU_MODE,
+       ABORTING,
+       CLOSING,
+       MORIBUND,
+       DEAD,
+};
+
+struct iwch_ep_common {
+       struct iw_cm_id *cm_id;
+       struct iwch_qp *qp;
+       struct t3cdev *tdev;
+       enum iwch_ep_state state;
+       struct kref kref;
+       spinlock_t lock;
+       struct sockaddr_in local_addr;
+       struct sockaddr_in remote_addr;
+       wait_queue_head_t waitq;
+       int rpl_done;
+       int rpl_err;
+};
+
+struct iwch_listen_ep {
+       struct iwch_ep_common com;
+       unsigned int stid;
+       int backlog;
+};
+
+struct iwch_ep {
+       struct iwch_ep_common com;
+       struct iwch_ep *parent_ep;
+       struct timer_list timer;
+       unsigned int atid;
+       u32 hwtid;
+       u32 snd_seq;
+       struct l2t_entry *l2t;
+       struct dst_entry *dst;
+       struct sk_buff *mpa_skb;
+       struct iwch_mpa_attributes mpa_attr;
+       unsigned int mpa_pkt_len;
+       u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+       u8 tos;
+       u16 emss;
+       u16 plen;
+       u32 ird;
+       u32 ord;
+};
+
+static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id)
+{
+       return cm_id->provider_data;
+}
+
+static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+       return cm_id->provider_data;
+}
+
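+/* Smallest TCP window scale (capped at 14) such that 65535 << wscale covers win */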
+static inline int compute_wscale(int win)
+{
+       int wscale = 0;
+
+       while (wscale < 14 && (65535<<wscale) < win)
+               wscale++;
+       return wscale;
+}
+
+/* CM prototypes */
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog);
+int iwch_destroy_listen(struct iw_cm_id *cm_id);
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp);
+int iwch_quiesce_tid(struct iwch_ep *ep);
+int iwch_resume_tid(struct iwch_ep *ep);
+void __free_ep(struct kref *kref);
+void iwch_rearp(struct iwch_ep *ep);
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);
+
+int __init iwch_cm_init(void);
+void __exit iwch_cm_term(void);
+
+#endif                         /* _IWCH_CM_H_ */
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
new file mode 100644 (file)
index 0000000..98b3bdb
--- /dev/null
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "iwch_provider.h"
+#include "iwch.h"
+
+/*
+ * Get one cq entry from cxio and map it to openib.
+ *
+ * Returns:
+ *     0                       EMPTY
+ *     1                       CQE returned
+ *     -EAGAIN                 caller must try again
+ *     any other -errno        fatal error
+ */
+static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
+                           struct ib_wc *wc)
+{
+       struct iwch_qp *qhp = NULL;
+       struct t3_cqe cqe, *rd_cqe;
+       struct t3_wq *wq;
+       u32 credit = 0;
+       u8 cqe_flushed;
+       u64 cookie;
+       int ret = 1;
+
+       rd_cqe = cxio_next_cqe(&chp->cq);
+
+       if (!rd_cqe)
+               return 0;
+
+       qhp = get_qhp(rhp, CQE_QPID(*rd_cqe));
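+       /* The QP may already be gone; in that case poll the CQ without a WQ. */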
+       if (!qhp)
+               wq = NULL;
+       else {
+               spin_lock(&qhp->lock);
+               wq = &(qhp->wq);
+       }
+       ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie,
+                                  &credit);
+       if (t3a_device(chp->rhp) && credit) {
+               PDBG("%s updating %d cq credits on id %d\n", __FUNCTION__,
+                    credit, chp->cq.cqid);
+               cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit);
+       }
+
+       if (ret) {
+               ret = -EAGAIN;
+               goto out;
+       }
+       ret = 1;
+
+       wc->wr_id = cookie;
+       wc->qp = &qhp->ibqp;
+       wc->vendor_err = CQE_STATUS(cqe);
+
+       PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
+            "lo 0x%x cookie 0x%llx\n", __FUNCTION__,
+            CQE_QPID(cqe), CQE_TYPE(cqe),
+            CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe),
+            CQE_WRID_LOW(cqe), (unsigned long long) cookie);
+
+       if (CQE_TYPE(cqe) == 0) {
+               if (!CQE_STATUS(cqe))
+                       wc->byte_len = CQE_LEN(cqe);
+               else
+                       wc->byte_len = 0;
+               wc->opcode = IB_WC_RECV;
+       } else {
+               switch (CQE_OPCODE(cqe)) {
+               case T3_RDMA_WRITE:
+                       wc->opcode = IB_WC_RDMA_WRITE;
+                       break;
+               case T3_READ_REQ:
+                       wc->opcode = IB_WC_RDMA_READ;
+                       wc->byte_len = CQE_LEN(cqe);
+                       break;
+               case T3_SEND:
+               case T3_SEND_WITH_SE:
+                       wc->opcode = IB_WC_SEND;
+                       break;
+               case T3_BIND_MW:
+                       wc->opcode = IB_WC_BIND_MW;
+                       break;
+
+               /* these aren't supported yet */
+               case T3_SEND_WITH_INV:
+               case T3_SEND_WITH_SE_INV:
+               case T3_LOCAL_INV:
+               case T3_FAST_REGISTER:
+               default:
+                       printk(KERN_ERR MOD "Unexpected opcode %d "
+                              "in the CQE received for QPID=0x%0x\n",
+                              CQE_OPCODE(cqe), CQE_QPID(cqe));
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       if (cqe_flushed)
+               wc->status = IB_WC_WR_FLUSH_ERR;
+       else {
+
+               switch (CQE_STATUS(cqe)) {
+               case TPT_ERR_SUCCESS:
+                       wc->status = IB_WC_SUCCESS;
+                       break;
+               case TPT_ERR_STAG:
+                       wc->status = IB_WC_LOC_ACCESS_ERR;
+                       break;
+               case TPT_ERR_PDID:
+                       wc->status = IB_WC_LOC_PROT_ERR;
+                       break;
+               case TPT_ERR_QPID:
+               case TPT_ERR_ACCESS:
+                       wc->status = IB_WC_LOC_ACCESS_ERR;
+                       break;
+               case TPT_ERR_WRAP:
+                       wc->status = IB_WC_GENERAL_ERR;
+                       break;
+               case TPT_ERR_BOUND:
+                       wc->status = IB_WC_LOC_LEN_ERR;
+                       break;
+               case TPT_ERR_INVALIDATE_SHARED_MR:
+               case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+                       wc->status = IB_WC_MW_BIND_ERR;
+                       break;
+               case TPT_ERR_CRC:
+               case TPT_ERR_MARKER:
+               case TPT_ERR_PDU_LEN_ERR:
+               case TPT_ERR_OUT_OF_RQE:
+               case TPT_ERR_DDP_VERSION:
+               case TPT_ERR_RDMA_VERSION:
+               case TPT_ERR_DDP_QUEUE_NUM:
+               case TPT_ERR_MSN:
+               case TPT_ERR_TBIT:
+               case TPT_ERR_MO:
+               case TPT_ERR_MSN_RANGE:
+               case TPT_ERR_IRD_OVERFLOW:
+               case TPT_ERR_OPCODE:
+                       wc->status = IB_WC_FATAL_ERR;
+                       break;
+               case TPT_ERR_SWFLUSH:
+                       wc->status = IB_WC_WR_FLUSH_ERR;
+                       break;
+               default:
+                       printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for "
+                              "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe));
+                       ret = -EINVAL;
+               }
+       }
+out:
+       if (wq)
+               spin_unlock(&qhp->lock);
+       return ret;
+}
+
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+       struct iwch_dev *rhp;
+       struct iwch_cq *chp;
+       unsigned long flags;
+       int npolled;
+       int err = 0;
+
+       chp = to_iwch_cq(ibcq);
+       rhp = chp->rhp;
+
+       spin_lock_irqsave(&chp->lock, flags);
+       for (npolled = 0; npolled < num_entries; ++npolled) {
+#ifdef DEBUG
+               int i = 0;
+#endif
+
+               /*
+                * Because T3 can post CQEs that are _not_ associated
+                * with a WR, we might have to poll again after removing
+                * one of these.
+                */
+               do {
+                       err = iwch_poll_cq_one(rhp, chp, wc + npolled);
+#ifdef DEBUG
+                       BUG_ON(++i > 1000);
+#endif
+               } while (err == -EAGAIN);
+               if (err <= 0)
+                       break;
+       }
+       spin_unlock_irqrestore(&chp->lock, flags);
+
+       if (err < 0)
+               return err;
+       else {
+               return npolled;
+       }
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
new file mode 100644 (file)
index 0000000..a6efa8f
--- /dev/null
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
+static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
+                         struct respQ_msg_t *rsp_msg,
+                         enum ib_event_type ib_event,
+                         int send_term)
+{
+       struct ib_event event;
+       struct iwch_qp_attributes attrs;
+       struct iwch_qp *qhp;
+
+       printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x "
+              "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__,
+              CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
+              CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
+              CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+
+       spin_lock(&rnicp->lock);
+       qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+
+       if (!qhp) {
+               printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n",
+                      __FUNCTION__, CQE_STATUS(rsp_msg->cqe),
+                      CQE_QPID(rsp_msg->cqe));
+               spin_unlock(&rnicp->lock);
+               return;
+       }
+
+       if ((qhp->attr.state == IWCH_QP_STATE_ERROR) ||
+           (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) {
+               PDBG("%s AE received after RTS - "
+                    "qp state %d qpid 0x%x status 0x%x\n", __FUNCTION__,
+                    qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe));
+               spin_unlock(&rnicp->lock);
+               return;
+       }
+
+       atomic_inc(&qhp->refcnt);
+       spin_unlock(&rnicp->lock);
+
+       event.event = ib_event;
+       event.device = chp->ibcq.device;
+       if (ib_event == IB_EVENT_CQ_ERR)
+               event.element.cq = &chp->ibcq;
+       else
+               event.element.qp = &qhp->ibqp;
+
+       if (qhp->ibqp.event_handler)
+               (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+       if (qhp->attr.state == IWCH_QP_STATE_RTS) {
+               attrs.next_state = IWCH_QP_STATE_TERMINATE;
+               iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
+                              &attrs, 1);
+               if (send_term)
+                       iwch_post_terminate(qhp, rsp_msg);
+       }
+
+       if (atomic_dec_and_test(&qhp->refcnt))
+               wake_up(&qhp->wait);
+}
+
+void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
+{
+       struct iwch_dev *rnicp;
+       struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+       struct iwch_cq *chp;
+       struct iwch_qp *qhp;
+       u32 cqid = RSPQ_CQID(rsp_msg);
+
+       rnicp = (struct iwch_dev *) rdev_p->ulp;
+       spin_lock(&rnicp->lock);
+       chp = get_chp(rnicp, cqid);
+       qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+       if (!chp || !qhp) {
+               printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+                      "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n",
+                      cqid, CQE_QPID(rsp_msg->cqe),
+                      CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+                      CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
+                      CQE_WRID_LOW(rsp_msg->cqe));
+               spin_unlock(&rnicp->lock);
+               goto out;
+       }
+       iwch_qp_add_ref(&qhp->ibqp);
+       atomic_inc(&chp->refcnt);
+       spin_unlock(&rnicp->lock);
+
+       /*
+        * A zero-status TERMINATE CQE means: 1) completion of our sending
+        * a TERMINATE, or 2) an incoming TERMINATE message.
+        */
+       if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) &&
+           (CQE_STATUS(rsp_msg->cqe) == 0)) {
+               if (SQ_TYPE(rsp_msg->cqe)) {
+                       PDBG("%s QPID 0x%x ep %p disconnecting\n",
+                            __FUNCTION__, qhp->wq.qpid, qhp->ep);
+                       iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+               } else {
+                       PDBG("%s post REQ_ERR AE QPID 0x%x\n", __FUNCTION__,
+                            qhp->wq.qpid);
+                       post_qp_event(rnicp, chp, rsp_msg,
+                                     IB_EVENT_QP_REQ_ERR, 0);
+                       iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+               }
+               goto done;
+       }
+
+       /* Bad incoming Read request */
+       if (SQ_TYPE(rsp_msg->cqe) &&
+           (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) {
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+               goto done;
+       }
+
+       /* Bad incoming write */
+       if (RQ_TYPE(rsp_msg->cqe) &&
+           (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) {
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+               goto done;
+       }
+
+       switch (CQE_STATUS(rsp_msg->cqe)) {
+
+       /* Completion Events */
+       case TPT_ERR_SUCCESS:
+
+               /*
+                * Confirm the destination entry if this is a send completion.
+                */
+               if (qhp->ep && SQ_TYPE(rsp_msg->cqe))
+                       dst_confirm(qhp->ep->dst);
+               (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+               break;
+
+       case TPT_ERR_STAG:
+       case TPT_ERR_PDID:
+       case TPT_ERR_QPID:
+       case TPT_ERR_ACCESS:
+       case TPT_ERR_WRAP:
+       case TPT_ERR_BOUND:
+       case TPT_ERR_INVALIDATE_SHARED_MR:
+       case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+               printk(KERN_ERR "%s - CQE Err qpid 0x%x opcode %d status 0x%x "
+                      "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __FUNCTION__,
+                      CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
+                      CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
+                      CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+               (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
+               break;
+
+       /* Device Fatal Errors */
+       case TPT_ERR_ECC:
+       case TPT_ERR_ECC_PSTAG:
+       case TPT_ERR_INTERNAL_ERR:
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1);
+               break;
+
+       /* QP Fatal Errors */
+       case TPT_ERR_OUT_OF_RQE:
+       case TPT_ERR_PBL_ADDR_BOUND:
+       case TPT_ERR_CRC:
+       case TPT_ERR_MARKER:
+       case TPT_ERR_PDU_LEN_ERR:
+       case TPT_ERR_DDP_VERSION:
+       case TPT_ERR_RDMA_VERSION:
+       case TPT_ERR_OPCODE:
+       case TPT_ERR_DDP_QUEUE_NUM:
+       case TPT_ERR_MSN:
+       case TPT_ERR_TBIT:
+       case TPT_ERR_MO:
+       case TPT_ERR_MSN_GAP:
+       case TPT_ERR_MSN_RANGE:
+       case TPT_ERR_RQE_ADDR_BOUND:
+       case TPT_ERR_IRD_OVERFLOW:
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+               break;
+
+       default:
+               printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n",
+                      CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid);
+               post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+               break;
+       }
+done:
+       if (atomic_dec_and_test(&chp->refcnt))
+               wake_up(&chp->wait);
+       iwch_qp_rem_ref(&qhp->ibqp);
+out:
+       dev_kfree_skb_irq(skb);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
new file mode 100644 (file)
index 0000000..2b6cd53
--- /dev/null
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+                                       struct iwch_mr *mhp,
+                                       int shift,
+                                       __be64 *page_list)
+{
+       u32 stag;
+       u32 mmid;
+
+
+       if (cxio_register_phys_mem(&rhp->rdev,
+                                  &stag, mhp->attr.pdid,
+                                  mhp->attr.perms,
+                                  mhp->attr.zbva,
+                                  mhp->attr.va_fbo,
+                                  mhp->attr.len,
+                                  shift-12,
+                                  page_list,
+                                  &mhp->attr.pbl_size, &mhp->attr.pbl_addr))
+               return -ENOMEM;
+       mhp->attr.state = 1;
+       mhp->attr.stag = stag;
+       mmid = stag >> 8;
+       mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+       insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+       PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp);
+       return 0;
+}
+
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+                                       struct iwch_mr *mhp,
+                                       int shift,
+                                       __be64 *page_list,
+                                       int npages)
+{
+       u32 stag;
+       u32 mmid;
+
+
+       /* We could support this... */
+       if (npages > mhp->attr.pbl_size)
+               return -ENOMEM;
+
+       stag = mhp->attr.stag;
+       if (cxio_reregister_phys_mem(&rhp->rdev,
+                                  &stag, mhp->attr.pdid,
+                                  mhp->attr.perms,
+                                  mhp->attr.zbva,
+                                  mhp->attr.va_fbo,
+                                  mhp->attr.len,
+                                  shift-12,
+                                  page_list,
+                                  &mhp->attr.pbl_size, &mhp->attr.pbl_addr))
+               return -ENOMEM;
+       mhp->attr.state = 1;
+       mhp->attr.stag = stag;
+       mmid = stag >> 8;
+       mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+       insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+       PDBG("%s mmid 0x%x mhp %p\n", __FUNCTION__, mmid, mhp);
+       return 0;
+}
+
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+                                       int num_phys_buf,
+                                       u64 *iova_start,
+                                       u64 *total_size,
+                                       int *npages,
+                                       int *shift,
+                                       __be64 **page_list)
+{
+       u64 mask;
+       int i, j, n;
+
+       mask = 0;
+       *total_size = 0;
+       for (i = 0; i < num_phys_buf; ++i) {
+               if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+                       return -EINVAL;
+               if (i != 0 && i != num_phys_buf - 1 &&
+                   (buffer_list[i].size & ~PAGE_MASK))
+                       return -EINVAL;
+               *total_size += buffer_list[i].size;
+               if (i > 0)
+                       mask |= buffer_list[i].addr;
+       }
+
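+       /* The total length must fit in 32 bits */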
+       if (*total_size > 0xFFFFFFFFULL)
+               return -ENOMEM;
+
+       /* Find largest page shift we can use to cover buffers */
+       for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+               if (num_phys_buf > 1) {
+                       if ((1ULL << *shift) & mask)
+                               break;
+               } else
+                       if (1ULL << *shift >=
+                           buffer_list[0].size +
+                           (buffer_list[0].addr & ((1ULL << *shift) - 1)))
+                               break;
+
+       buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+       buffer_list[0].addr &= ~0ull << *shift;
+
+       *npages = 0;
+       for (i = 0; i < num_phys_buf; ++i)
+               *npages += (buffer_list[i].size +
+                       (1ULL << *shift) - 1) >> *shift;
+
+       if (!*npages)
+               return -EINVAL;
+
+       *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+       if (!*page_list)
+               return -ENOMEM;
+
+       n = 0;
+       for (i = 0; i < num_phys_buf; ++i)
+               for (j = 0;
+                    j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+                    ++j)
+                       (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+                           ((u64) j << *shift));
+
+       PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+            __FUNCTION__, (unsigned long long) *iova_start,
+            (unsigned long long) mask, *shift, (unsigned long long) *total_size,
+            *npages);
+
+       return 0;
+
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
new file mode 100644 (file)
index 0000000..6861087
--- /dev/null
@@ -0,0 +1,1203 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "cxio_hal.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+#include "iwch_user.h"
+
+static int iwch_modify_port(struct ib_device *ibdev,
+                           u8 port, int port_modify_mask,
+                           struct ib_port_modify *props)
+{
+       return -ENOSYS;
+}
+
+static struct ib_ah *iwch_ah_create(struct ib_pd *pd,
+                                   struct ib_ah_attr *ah_attr)
+{
+       return ERR_PTR(-ENOSYS);
+}
+
+static int iwch_ah_destroy(struct ib_ah *ah)
+{
+       return -ENOSYS;
+}
+
+static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       return -ENOSYS;
+}
+
+static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       return -ENOSYS;
+}
+
+static int iwch_process_mad(struct ib_device *ibdev,
+                           int mad_flags,
+                           u8 port_num,
+                           struct ib_wc *in_wc,
+                           struct ib_grh *in_grh,
+                           struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+       return -ENOSYS;
+}
+
+static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+{
+       struct iwch_dev *rhp = to_iwch_dev(context->device);
+       struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
+       struct iwch_mm_entry *mm, *tmp;
+
+       PDBG("%s context %p\n", __FUNCTION__, context);
+       list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+               kfree(mm);
+       cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
+       kfree(ucontext);
+       return 0;
+}
+
+static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
+                                       struct ib_udata *udata)
+{
+       struct iwch_ucontext *context;
+       struct iwch_dev *rhp = to_iwch_dev(ibdev);
+
+       PDBG("%s ibdev %p\n", __FUNCTION__, ibdev);
+       context = kzalloc(sizeof(*context), GFP_KERNEL);
+       if (!context)
+               return ERR_PTR(-ENOMEM);
+       cxio_init_ucontext(&rhp->rdev, &context->uctx);
+       INIT_LIST_HEAD(&context->mmaps);
+       spin_lock_init(&context->mmap_lock);
+       return &context->ibucontext;
+}
+
+static int iwch_destroy_cq(struct ib_cq *ib_cq)
+{
+       struct iwch_cq *chp;
+
+       PDBG("%s ib_cq %p\n", __FUNCTION__, ib_cq);
+       chp = to_iwch_cq(ib_cq);
+
+       remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+       atomic_dec(&chp->refcnt);
+       wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+       cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+       kfree(chp);
+       return 0;
+}
+
+static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries,
+                            struct ib_ucontext *ib_context,
+                            struct ib_udata *udata)
+{
+       struct iwch_dev *rhp;
+       struct iwch_cq *chp;
+       struct iwch_create_cq_resp uresp;
+       struct iwch_create_cq_req ureq;
+       struct iwch_ucontext *ucontext = NULL;
+
+       PDBG("%s ib_dev %p entries %d\n", __FUNCTION__, ibdev, entries);
+       rhp = to_iwch_dev(ibdev);
+       chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+       if (!chp)
+               return ERR_PTR(-ENOMEM);
+
+       if (ib_context) {
+               ucontext = to_iwch_ucontext(ib_context);
+               if (!t3a_device(rhp)) {
+                       if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
+                               kfree(chp);
+                               return ERR_PTR(-EFAULT);
+                       }
+                       chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
+               }
+       }
+
+       if (t3a_device(rhp)) {
+
+               /*
+                * T3A: add some headroom for the extra CQEs that can be
+                * inserted for various errors:
+                *      TERMINATE,
+                *      incoming RDMA WRITE failures,
+                *      incoming RDMA READ REQUEST failures.
+                * NOTE: we cannot guarantee the CQ won't overflow.
+                */
+               entries += 16;
+       }
+       entries = roundup_pow_of_two(entries);
+       chp->cq.size_log2 = ilog2(entries);
+
+       if (cxio_create_cq(&rhp->rdev, &chp->cq)) {
+               kfree(chp);
+               return ERR_PTR(-ENOMEM);
+       }
+       chp->rhp = rhp;
+       chp->ibcq.cqe = (1 << chp->cq.size_log2) - 1;
+       spin_lock_init(&chp->lock);
+       atomic_set(&chp->refcnt, 1);
+       init_waitqueue_head(&chp->wait);
+       insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+
+       if (ucontext) {
+               struct iwch_mm_entry *mm;
+
+               mm = kmalloc(sizeof *mm, GFP_KERNEL);
+               if (!mm) {
+                       iwch_destroy_cq(&chp->ibcq);
+                       return ERR_PTR(-ENOMEM);
+               }
+               uresp.cqid = chp->cq.cqid;
+               uresp.size_log2 = chp->cq.size_log2;
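+               /*
+                * Hand userspace a per-context pseudo mmap offset (key);
+                * it is passed back as the mmap offset and matched up by
+                * iwch_mmap() via the iwch_mm_entry list.
+                */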
+               spin_lock(&ucontext->mmap_lock);
+               uresp.key = ucontext->key;
+               ucontext->key += PAGE_SIZE;
+               spin_unlock(&ucontext->mmap_lock);
+               if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+                       kfree(mm);
+                       iwch_destroy_cq(&chp->ibcq);
+                       return ERR_PTR(-EFAULT);
+               }
+               mm->key = uresp.key;
+               mm->addr = virt_to_phys(chp->cq.queue);
+               mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
+                                            sizeof (struct t3_cqe));
+               insert_mmap(ucontext, mm);
+       }
+       PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n",
+            chp->cq.cqid, chp, (1 << chp->cq.size_log2),
+            (unsigned long long) chp->cq.dma_addr);
+       return &chp->ibcq;
+}
+
+static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+#ifdef notyet
+       struct iwch_cq *chp = to_iwch_cq(cq);
+       struct t3_cq oldcq, newcq;
+       int ret;
+
+       PDBG("%s ib_cq %p cqe %d\n", __FUNCTION__, cq, cqe);
+
+       /* We don't downsize... */
+       if (cqe <= cq->cqe)
+               return 0;
+
+       /* create new t3_cq with new size */
+       cqe = roundup_pow_of_two(cqe+1);
+       newcq.size_log2 = ilog2(cqe);
+
+       /* Dont allow resize to less than the current wce count */
+       if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
+               return -ENOMEM;
+       }
+
+       /* Quiesce all QPs using this CQ */
+       ret = iwch_quiesce_qps(chp);
+       if (ret) {
+               return ret;
+       }
+
+       ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
+       if (ret) {
+               return ret;
+       }
+
+       /* copy CQEs */
+       memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
+                                       sizeof(struct t3_cqe));
+
+       /* old iwch_qp gets new t3_cq but keeps old cqid */
+       oldcq = chp->cq;
+       chp->cq = newcq;
+       chp->cq.cqid = oldcq.cqid;
+
+       /* resize new t3_cq to update the HW context */
+       ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
+       if (ret) {
+               chp->cq = oldcq;
+               return ret;
+       }
+       chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
+
+       /* destroy old t3_cq */
+       oldcq.cqid = newcq.cqid;
+       ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
+       if (ret) {
+               printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n",
+                       __FUNCTION__, ret);
+       }
+
+       /* add user hooks here */
+
+       /* resume qps */
+       ret = iwch_resume_qps(chp);
+       return ret;
+#else
+       return -ENOSYS;
+#endif
+}
+
+static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+       struct iwch_dev *rhp;
+       struct iwch_cq *chp;
+       enum t3_cq_opcode cq_op;
+       int err;
+       unsigned long flag;
+       u32 rptr;
+
+       chp = to_iwch_cq(ibcq);
+       rhp = chp->rhp;
+       if (notify == IB_CQ_SOLICITED)
+               cq_op = CQ_ARM_SE;
+       else
+               cq_op = CQ_ARM_AN;
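+       /*
+        * For userspace CQs, pick up the consumer's current read pointer
+        * from the user-mapped location before arming.
+        */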
+       if (chp->user_rptr_addr) {
+               if (get_user(rptr, chp->user_rptr_addr))
+                       return -EFAULT;
+               spin_lock_irqsave(&chp->lock, flag);
+               chp->cq.rptr = rptr;
+       } else
+               spin_lock_irqsave(&chp->lock, flag);
+       PDBG("%s rptr 0x%x\n", __FUNCTION__, chp->cq.rptr);
+       err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0);
+       spin_unlock_irqrestore(&chp->lock, flag);
+       if (err)
+               printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err,
+                      chp->cq.cqid);
+       return err;
+}
+
+static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+       int len = vma->vm_end - vma->vm_start;
+       u32 key = vma->vm_pgoff << PAGE_SHIFT;
+       struct cxio_rdev *rdev_p;
+       int ret = 0;
+       struct iwch_mm_entry *mm;
+       struct iwch_ucontext *ucontext;
+
+       PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __FUNCTION__, vma->vm_pgoff,
+            key, len);
+
+       if (vma->vm_start & (PAGE_SIZE-1)) {
+               return -EINVAL;
+       }
+
+       rdev_p = &(to_iwch_dev(context->device)->rdev);
+       ucontext = to_iwch_ucontext(context);
+
+       mm = remove_mmap(ucontext, key, len);
+       if (!mm)
+               return -EINVAL;
+
+       if ((mm->addr >= rdev_p->rnic_info.udbell_physbase) &&
+           (mm->addr < (rdev_p->rnic_info.udbell_physbase +
+                      rdev_p->rnic_info.udbell_len))) {
+
+               /*
+                * Map T3 DB register.
+                */
+               if (vma->vm_flags & VM_READ) {
+                       return -EPERM;
+               }
+
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+               vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               vma->vm_flags &= ~VM_MAYREAD;
+               ret = io_remap_pfn_range(vma, vma->vm_start,
+                                        mm->addr >> PAGE_SHIFT,
+                                        len, vma->vm_page_prot);
+       } else {
+
+               /*
+                * Map WQ or CQ contig dma memory...
+                */
+               ret = remap_pfn_range(vma, vma->vm_start,
+                                     mm->addr >> PAGE_SHIFT,
+                                     len, vma->vm_page_prot);
+       }
+
+       /* mm was unlinked by remove_mmap(); free it now that we are done */
+       kfree(mm);
+       return ret;
+}
+
+static int iwch_deallocate_pd(struct ib_pd *pd)
+{
+       struct iwch_dev *rhp;
+       struct iwch_pd *php;
+
+       php = to_iwch_pd(pd);
+       rhp = php->rhp;
+       PDBG("%s ibpd %p pdid 0x%x\n", __FUNCTION__, pd, php->pdid);
+       cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
+       kfree(php);
+       return 0;
+}
+
+static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
+                              struct ib_ucontext *context,
+                              struct ib_udata *udata)
+{
+       struct iwch_pd *php;
+       u32 pdid;
+       struct iwch_dev *rhp;
+
+       PDBG("%s ibdev %p\n", __FUNCTION__, ibdev);
+       rhp = to_iwch_dev(ibdev);
+       pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
+       if (!pdid)
+               return ERR_PTR(-EINVAL);
+       php = kzalloc(sizeof(*php), GFP_KERNEL);
+       if (!php) {
+               cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
+               return ERR_PTR(-ENOMEM);
+       }
+       php->pdid = pdid;
+       php->rhp = rhp;
+       if (context) {
+               if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) {
+                       iwch_deallocate_pd(&php->ibpd);
+                       return ERR_PTR(-EFAULT);
+               }
+       }
+       PDBG("%s pdid 0x%0x ptr 0x%p\n", __FUNCTION__, pdid, php);
+       return &php->ibpd;
+}
+
+static int iwch_dereg_mr(struct ib_mr *ib_mr)
+{
+       struct iwch_dev *rhp;
+       struct iwch_mr *mhp;
+       u32 mmid;
+
+       PDBG("%s ib_mr %p\n", __FUNCTION__, ib_mr);
+       /* Refuse to free an MR that still has memory windows bound to it */
+       if (atomic_read(&ib_mr->usecnt))
+               return -EINVAL;
+
+       mhp = to_iwch_mr(ib_mr);
+       rhp = mhp->rhp;
+       mmid = mhp->attr.stag >> 8;
+       cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+                      mhp->attr.pbl_addr);
+       remove_handle(rhp, &rhp->mmidr, mmid);
+       if (mhp->kva)
+               kfree((void *) (unsigned long) mhp->kva);
+       PDBG("%s mmid 0x%x ptr %p\n", __FUNCTION__, mmid, mhp);
+       kfree(mhp);
+       return 0;
+}
+
+static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
+                                       struct ib_phys_buf *buffer_list,
+                                       int num_phys_buf,
+                                       int acc,
+                                       u64 *iova_start)
+{
+       __be64 *page_list;
+       int shift;
+       u64 total_size;
+       int npages;
+       struct iwch_dev *rhp;
+       struct iwch_pd *php;
+       struct iwch_mr *mhp;
+       int ret;
+
+       PDBG("%s ib_pd %p\n", __FUNCTION__, pd);
+       php = to_iwch_pd(pd);
+       rhp = php->rhp;
+
+       acc = iwch_convert_access(acc);
+
+
+       mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+       if (!mhp)
+               return ERR_PTR(-ENOMEM);
+
+       /* First check that we have enough alignment */
+       if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       if (num_phys_buf > 1 &&
+           ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+               ret = -EINVAL;
+               goto err;
+       }
+
+       ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+                                  &total_size, &npages, &shift, &page_list);
+       if (ret)
+               goto err;
+
+       mhp->rhp = rhp;
+       mhp->attr.pdid = php->pdid;
+       mhp->attr.zbva = 0;
+
+       /* NOTE: TPT perms are backwards from BIND WR perms! */
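+       /*
+        * The low four IWCH_MEM_ACCESS_* bits are mirrored here:
+        * LOCAL_READ lands in the REMOTE_WRITE bit position and vice
+        * versa, likewise LOCAL_WRITE and REMOTE_READ.
+        */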
+       mhp->attr.perms = (acc & 0x1) << 3;
+       mhp->attr.perms |= (acc & 0x2) << 1;
+       mhp->attr.perms |= (acc & 0x4) >> 1;
+       mhp->attr.perms |= (acc & 0x8) >> 3;
+
+       mhp->attr.va_fbo = *iova_start;
+       mhp->attr.page_size = shift - 12;
+
+       mhp->attr.len = (u32) total_size;
+       mhp->attr.pbl_size = npages;
+       ret = iwch_register_mem(rhp, php, mhp, shift, page_list);
+       kfree(page_list);
+       if (ret) {
+               goto err;
+       }
+       return &mhp->ibmr;
+err:
+       kfree(mhp);
+       return ERR_PTR(ret);
+
+}
+
+static int iwch_reregister_phys_mem(struct ib_mr *mr,
+                                    int mr_rereg_mask,
+                                    struct ib_pd *pd,
+                                    struct ib_phys_buf *buffer_list,
+                                    int num_phys_buf,
+                                    int acc, u64 * iova_start)
+{
+
+       struct iwch_mr mh, *mhp;
+       struct iwch_pd *php;
+       struct iwch_dev *rhp;
+       int new_acc;
+       __be64 *page_list = NULL;
+       int shift = 0;
+       u64 total_size;
+       int npages = 0;
+       int ret;
+
+       PDBG("%s ib_mr %p ib_pd %p\n", __FUNCTION__, mr, pd);
+
+       /* Refuse to reregister an MR that has memory windows bound to it */
+       if (atomic_read(&mr->usecnt))
+               return -EINVAL;
+
+       mhp = to_iwch_mr(mr);
+       rhp = mhp->rhp;
+       php = to_iwch_pd(mr->pd);
+
+       /* make sure we are on the same adapter */
+       if (rhp != php->rhp)
+               return -EINVAL;
+
+       new_acc = mhp->attr.perms;
+
+       memcpy(&mh, mhp, sizeof *mhp);
+
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               php = to_iwch_pd(pd);
+       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+               mh.attr.perms = iwch_convert_access(acc);
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+               ret = build_phys_page_list(buffer_list, num_phys_buf,
+                                          iova_start,
+                                          &total_size, &npages,
+                                          &shift, &page_list);
+               if (ret)
+                       return ret;
+       }
+
+       ret = iwch_reregister_mem(rhp, php, &mh, shift, page_list, npages);
+       kfree(page_list);
+       if (ret) {
+               return ret;
+       }
+       if (mr_rereg_mask & IB_MR_REREG_PD)
+               mhp->attr.pdid = php->pdid;
+       if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+               mhp->attr.perms = iwch_convert_access(acc);
+       if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+               mhp->attr.zbva = 0;
+               mhp->attr.va_fbo = *iova_start;
+               mhp->attr.page_size = shift - 12;
+               mhp->attr.len = (u32) total_size;
+               mhp->attr.pbl_size = npages;
+       }
+
+       return 0;
+}
+
+
+static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
+                                     int acc, struct ib_udata *udata)
+{
+       __be64 *pages;
+       int shift, n, len;
+       int i, j, k;
+       int err = 0;
+       struct ib_umem_chunk *chunk;
+       struct iwch_dev *rhp;
+       struct iwch_pd *php;
+       struct iwch_mr *mhp;
+       struct iwch_reg_user_mr_resp uresp;
+
+       PDBG("%s ib_pd %p\n", __FUNCTION__, pd);
+       shift = ffs(region->page_size) - 1;
+
+       php = to_iwch_pd(pd);
+       rhp = php->rhp;
+       mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+       if (!mhp)
+               return ERR_PTR(-ENOMEM);
+
+       n = 0;
+       list_for_each_entry(chunk, &region->chunk_list, list)
+               n += chunk->nents;
+
+       pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+       if (!pages) {
+               err = -ENOMEM;
+               goto err;
+       }
+
+       acc = iwch_convert_access(acc);
+
+       i = n = 0;
+
+       list_for_each_entry(chunk, &region->chunk_list, list)
+               for (j = 0; j < chunk->nmap; ++j) {
+                       len = sg_dma_len(&chunk->page_list[j]) >> shift;
+                       for (k = 0; k < len; ++k) {
+                               pages[i++] = cpu_to_be64(sg_dma_address(
+                                       &chunk->page_list[j]) +
+                                       region->page_size * k);
+                       }
+               }
+
+       mhp->rhp = rhp;
+       mhp->attr.pdid = php->pdid;
+       mhp->attr.zbva = 0;
+       mhp->attr.perms = (acc & 0x1) << 3;
+       mhp->attr.perms |= (acc & 0x2) << 1;
+       mhp->attr.perms |= (acc & 0x4) >> 1;
+       mhp->attr.perms |= (acc & 0x8) >> 3;
+       mhp->attr.va_fbo = region->virt_base;
+       mhp->attr.page_size = shift - 12;
+       mhp->attr.len = (u32) region->length;
+       mhp->attr.pbl_size = i;
+       err = iwch_register_mem(rhp, php, mhp, shift, pages);
+       kfree(pages);
+       if (err)
+               goto err;
+
+       if (udata && t3b_device(rhp)) {
+               uresp.pbl_addr = (mhp->attr.pbl_addr -
+                                rhp->rdev.rnic_info.pbl_base) >> 3;
+               PDBG("%s user resp pbl_addr 0x%x\n", __FUNCTION__,
+                    uresp.pbl_addr);
+
+               if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+                       iwch_dereg_mr(&mhp->ibmr);
+                       err = -EFAULT;
+                       goto err;
+               }
+       }
+
+       return &mhp->ibmr;
+
+err:
+       kfree(mhp);
+       return ERR_PTR(err);
+}
+
+static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct ib_phys_buf bl;
+       u64 kva;
+       struct ib_mr *ibmr;
+
+       PDBG("%s ib_pd %p\n", __FUNCTION__, pd);
+
+       /*
+        * T3 only supports 32 bits of size.
+        */
+       bl.size = 0xffffffff;
+       bl.addr = 0;
+       kva = 0;
+       ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva);
+       return ibmr;
+}
+
+static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd)
+{
+       struct iwch_dev *rhp;
+       struct iwch_pd *php;
+       struct iwch_mw *mhp;
+       u32 mmid;
+       u32 stag = 0;
+       int ret;
+
+       php = to_iwch_pd(pd);
+       rhp = php->rhp;
+       mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+       if (!mhp)
+               return ERR_PTR(-ENOMEM);
+       ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid);
+       if (ret) {
+               kfree(mhp);
+               return ERR_PTR(ret);
+       }
+       mhp->rhp = rhp;
+       mhp->attr.pdid = php->pdid;
+       mhp->attr.type = TPT_MW;
+       mhp->attr.stag = stag;
+       mmid = (stag) >> 8;
+       insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+       PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __FUNCTION__, mmid, mhp, stag);
+       return &(mhp->ibmw);
+}
+
+static int iwch_dealloc_mw(struct ib_mw *mw)
+{
+       struct iwch_dev *rhp;
+       struct iwch_mw *mhp;
+       u32 mmid;
+
+       mhp = to_iwch_mw(mw);
+       rhp = mhp->rhp;
+       mmid = (mw->rkey) >> 8;
+       cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+       remove_handle(rhp, &rhp->mmidr, mmid);
+       kfree(mhp);
+       PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __FUNCTION__, mw, mmid, mhp);
+       return 0;
+}
+
+static int iwch_destroy_qp(struct ib_qp *ib_qp)
+{
+       struct iwch_dev *rhp;
+       struct iwch_qp *qhp;
+       struct iwch_qp_attributes attrs;
+       struct iwch_ucontext *ucontext;
+
+       qhp = to_iwch_qp(ib_qp);
+       rhp = qhp->rhp;
+
+       if (qhp->attr.state == IWCH_QP_STATE_RTS) {
+               attrs.next_state = IWCH_QP_STATE_ERROR;
+               iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
+       }
+       wait_event(qhp->wait, !qhp->ep);
+
+       remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+
+       atomic_dec(&qhp->refcnt);
+       wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+       ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context)
+                                 : NULL;
+       cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+                       ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+       PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __FUNCTION__,
+            ib_qp, qhp->wq.qpid, qhp);
+       kfree(qhp);
+       return 0;
+}
+
+static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
+                            struct ib_qp_init_attr *attrs,
+                            struct ib_udata *udata)
+{
+       struct iwch_dev *rhp;
+       struct iwch_qp *qhp;
+       struct iwch_pd *php;
+       struct iwch_cq *schp;
+       struct iwch_cq *rchp;
+       struct iwch_create_qp_resp uresp;
+       int wqsize, sqsize, rqsize;
+       struct iwch_ucontext *ucontext;
+
+       PDBG("%s ib_pd %p\n", __FUNCTION__, pd);
+       if (attrs->qp_type != IB_QPT_RC)
+               return ERR_PTR(-EINVAL);
+       php = to_iwch_pd(pd);
+       rhp = php->rhp;
+       schp = get_chp(rhp, to_iwch_cq(attrs->send_cq)->cq.cqid);
+       rchp = get_chp(rhp, to_iwch_cq(attrs->recv_cq)->cq.cqid);
+       if (!schp || !rchp)
+               return ERR_PTR(-EINVAL);
+
+       /* The RQT size must be # of entries + 1 rounded up to a power of two */
+       rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr);
+       if (rqsize == attrs->cap.max_recv_wr)
+               rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1);
+
+       /* T3 doesn't support RQT depth < 16 */
+       if (rqsize < 16)
+               rqsize = 16;
+
+       if (rqsize > T3_MAX_RQ_SIZE)
+               return ERR_PTR(-EINVAL);
+
+       /*
+        * NOTE: The SQ and total WQ sizes don't need to be
+        * a power of two.  However, all the code assumes
+        * they are. EG: Q_FREECNT() and friends.
+        */
+       sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
+       wqsize = roundup_pow_of_two(rqsize + sqsize);
+       PDBG("%s wqsize %d sqsize %d rqsize %d\n", __FUNCTION__,
+            wqsize, sqsize, rqsize);
+       qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+       if (!qhp)
+               return ERR_PTR(-ENOMEM);
+       qhp->wq.size_log2 = ilog2(wqsize);
+       qhp->wq.rq_size_log2 = ilog2(rqsize);
+       qhp->wq.sq_size_log2 = ilog2(sqsize);
+       ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+       if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
+                          ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
+               kfree(qhp);
+               return ERR_PTR(-ENOMEM);
+       }
+       attrs->cap.max_recv_wr = rqsize - 1;
+       attrs->cap.max_send_wr = sqsize;
+       qhp->rhp = rhp;
+       qhp->attr.pd = php->pdid;
+       qhp->attr.scq = to_iwch_cq(attrs->send_cq)->cq.cqid;
+       qhp->attr.rcq = to_iwch_cq(attrs->recv_cq)->cq.cqid;
+       qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+       qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+       qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+       qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+       qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+       qhp->attr.state = IWCH_QP_STATE_IDLE;
+       qhp->attr.next_state = IWCH_QP_STATE_IDLE;
+
+       /*
+        * XXX - These don't get passed in from the openib user
+        * at create time.  The CM sets them via a QP modify.
+        * Need to fix: ideally these should be settable at create time.
+        */
+       qhp->attr.enable_rdma_read = 1;
+       qhp->attr.enable_rdma_write = 1;
+       qhp->attr.enable_bind = 1;
+       qhp->attr.max_ord = 1;
+       qhp->attr.max_ird = 1;
+
+       spin_lock_init(&qhp->lock);
+       init_waitqueue_head(&qhp->wait);
+       atomic_set(&qhp->refcnt, 1);
+       insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid);
+
+       if (udata) {
+
+               struct iwch_mm_entry *mm1, *mm2;
+
+               mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+               if (!mm1) {
+                       iwch_destroy_qp(&qhp->ibqp);
+                       return ERR_PTR(-ENOMEM);
+               }
+
+               mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+               if (!mm2) {
+                       kfree(mm1);
+                       iwch_destroy_qp(&qhp->ibqp);
+                       return ERR_PTR(-ENOMEM);
+               }
+
+               uresp.qpid = qhp->wq.qpid;
+               uresp.size_log2 = qhp->wq.size_log2;
+               uresp.sq_size_log2 = qhp->wq.sq_size_log2;
+               uresp.rq_size_log2 = qhp->wq.rq_size_log2;
+               spin_lock(&ucontext->mmap_lock);
+               uresp.key = ucontext->key;
+               ucontext->key += PAGE_SIZE;
+               uresp.db_key = ucontext->key;
+               ucontext->key += PAGE_SIZE;
+               spin_unlock(&ucontext->mmap_lock);
+               if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+                       kfree(mm1);
+                       kfree(mm2);
+                       iwch_destroy_qp(&qhp->ibqp);
+                       return ERR_PTR(-EFAULT);
+               }
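+               /* mm1 maps the WQ memory, mm2 maps the user doorbell page */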
+               mm1->key = uresp.key;
+               mm1->addr = virt_to_phys(qhp->wq.queue);
+               mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr));
+               insert_mmap(ucontext, mm1);
+               mm2->key = uresp.db_key;
+               mm2->addr = qhp->wq.udb & PAGE_MASK;
+               mm2->len = PAGE_SIZE;
+               insert_mmap(ucontext, mm2);
+       }
+       qhp->ibqp.qp_num = qhp->wq.qpid;
+       init_timer(&(qhp->timer));
+       PDBG("%s sq_num_entries %d, rq_num_entries %d "
+            "qpid 0x%0x qhp %p dma_addr 0x%llx size %d\n",
+            __FUNCTION__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+            qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr,
+            1 << qhp->wq.size_log2);
+       return &qhp->ibqp;
+}
+
+static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                     int attr_mask, struct ib_udata *udata)
+{
+       struct iwch_dev *rhp;
+       struct iwch_qp *qhp;
+       enum iwch_qp_attr_mask mask = 0;
+       struct iwch_qp_attributes attrs;
+
+       PDBG("%s ib_qp %p\n", __FUNCTION__, ibqp);
+
+       /* iwarp does not support the RTR state */
+       if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+               attr_mask &= ~IB_QP_STATE;
+
+       /* Make sure we still have something left to do */
+       if (!attr_mask)
+               return 0;
+
+       memset(&attrs, 0, sizeof attrs);
+       qhp = to_iwch_qp(ibqp);
+       rhp = qhp->rhp;
+
+       attrs.next_state = iwch_convert_state(attr->qp_state);
+       attrs.enable_rdma_read = (attr->qp_access_flags &
+                              IB_ACCESS_REMOTE_READ) ?  1 : 0;
+       attrs.enable_rdma_write = (attr->qp_access_flags &
+                               IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+       attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+       mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0;
+       mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+                       (IWCH_QP_ATTR_ENABLE_RDMA_READ |
+                        IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+                        IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+       return iwch_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp)
+{
+       PDBG("%s ib_qp %p\n", __FUNCTION__, qp);
+       atomic_inc(&(to_iwch_qp(qp)->refcnt));
+}
+
+void iwch_qp_rem_ref(struct ib_qp *qp)
+{
+       PDBG("%s ib_qp %p\n", __FUNCTION__, qp);
+       if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt)))
+               wake_up(&(to_iwch_qp(qp)->wait));
+}
+
+struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn)
+{
+       PDBG("%s ib_dev %p qpn 0x%x\n", __FUNCTION__, dev, qpn);
+       return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn);
+}
+
+
+static int iwch_query_pkey(struct ib_device *ibdev,
+                          u8 port, u16 index, u16 * pkey)
+{
+       PDBG("%s ibdev %p\n", __FUNCTION__, ibdev);
+       *pkey = 0;
+       return 0;
+}
+
+static int iwch_query_gid(struct ib_device *ibdev, u8 port,
+                         int index, union ib_gid *gid)
+{
+       struct iwch_dev *dev;
+
+       PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+              __FUNCTION__, ibdev, port, index, gid);
+       dev = to_iwch_dev(ibdev);
+       BUG_ON(port == 0 || port > 2);
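+       /* The GID is the port's MAC address, zero-padded to 16 bytes */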
+       memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+       memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6);
+       return 0;
+}
+
+static int iwch_query_device(struct ib_device *ibdev,
+                            struct ib_device_attr *props)
+{
+
+       struct iwch_dev *dev;
+       PDBG("%s ibdev %p\n", __FUNCTION__, ibdev);
+
+       dev = to_iwch_dev(ibdev);
+       memset(props, 0, sizeof *props);
+       memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+       props->device_cap_flags = dev->device_cap_flags;
+       props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
+       props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
+       props->max_mr_size = ~0ull;
+       props->max_qp = dev->attr.max_qps;
+       props->max_qp_wr = dev->attr.max_wrs;
+       props->max_sge = dev->attr.max_sge_per_wr;
+       props->max_sge_rd = 1;
+       props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp;
+       props->max_cq = dev->attr.max_cqs;
+       props->max_cqe = dev->attr.max_cqes_per_cq;
+       props->max_mr = dev->attr.max_mem_regs;
+       props->max_pd = dev->attr.max_pds;
+       props->local_ca_ack_delay = 0;
+
+       return 0;
+}
+
+static int iwch_query_port(struct ib_device *ibdev,
+                          u8 port, struct ib_port_attr *props)
+{
+       PDBG("%s ibdev %p\n", __FUNCTION__, ibdev);
+       props->max_mtu = IB_MTU_4096;
+       props->lid = 0;
+       props->lmc = 0;
+       props->sm_lid = 0;
+       props->sm_sl = 0;
+       props->state = IB_PORT_ACTIVE;
+       props->phys_state = 0;
+       props->port_cap_flags =
+           IB_PORT_CM_SUP |
+           IB_PORT_SNMP_TUNNEL_SUP |
+           IB_PORT_REINIT_SUP |
+           IB_PORT_DEVICE_MGMT_SUP |
+           IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+       props->gid_tbl_len = 1;
+       props->pkey_tbl_len = 1;
+       props->qkey_viol_cntr = 0;
+       props->active_width = 2;
+       props->active_speed = 2;
+       props->max_msg_sz = -1;
+
+       return 0;
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+       struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
+                                           ibdev.class_dev);
+       PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev);
+       return sprintf(buf, "%d\n", dev->rdev.t3cdev_p->type);
+}
+
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+       struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
+                                           ibdev.class_dev);
+       struct ethtool_drvinfo info;
+       struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
+
+       PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev);
+       lldev->ethtool_ops->get_drvinfo(lldev, &info);
+       return sprintf(buf, "%s\n", info.fw_version);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+       struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
+                                           ibdev.class_dev);
+       struct ethtool_drvinfo info;
+       struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
+
+       PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev);
+       lldev->ethtool_ops->get_drvinfo(lldev, &info);
+       return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct class_device *cdev, char *buf)
+{
+       struct iwch_dev *dev = container_of(cdev, struct iwch_dev,
+                                           ibdev.class_dev);
+       PDBG("%s class dev 0x%p\n", __FUNCTION__, dev);
+       return sprintf(buf, "%x.%x\n", dev->rdev.rnic_info.pdev->vendor,
+                                      dev->rdev.rnic_info.pdev->device);
+}
+
+static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static CLASS_DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct class_device_attribute *iwch_class_attributes[] = {
+       &class_device_attr_hw_rev,
+       &class_device_attr_fw_ver,
+       &class_device_attr_hca_type,
+       &class_device_attr_board_id
+};
+
+int iwch_register_device(struct iwch_dev *dev)
+{
+       int ret;
+       int i;
+
+       PDBG("%s iwch_dev %p\n", __FUNCTION__, dev);
+       strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
+       memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+       memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+       dev->ibdev.owner = THIS_MODULE;
+       dev->device_cap_flags =
+           (IB_DEVICE_ZERO_STAG |
+            IB_DEVICE_SEND_W_INV | IB_DEVICE_MEM_WINDOW);
+
+       dev->ibdev.uverbs_cmd_mask =
+           (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+           (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+           (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+           (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+           (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+           (1ull << IB_USER_VERBS_CMD_REG_MR) |
+           (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+           (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+           (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+           (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+           (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+           (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+           (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+           (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+           (1ull << IB_USER_VERBS_CMD_POST_RECV);
+       dev->ibdev.node_type = RDMA_NODE_RNIC;
+       memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC));
+       dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
+       dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev);
+       dev->ibdev.class_dev.dev = &(dev->rdev.rnic_info.pdev->dev);
+       dev->ibdev.query_device = iwch_query_device;
+       dev->ibdev.query_port = iwch_query_port;
+       dev->ibdev.modify_port = iwch_modify_port;
+       dev->ibdev.query_pkey = iwch_query_pkey;
+       dev->ibdev.query_gid = iwch_query_gid;
+       dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
+       dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
+       dev->ibdev.mmap = iwch_mmap;
+       dev->ibdev.alloc_pd = iwch_allocate_pd;
+       dev->ibdev.dealloc_pd = iwch_deallocate_pd;
+       dev->ibdev.create_ah = iwch_ah_create;
+       dev->ibdev.destroy_ah = iwch_ah_destroy;
+       dev->ibdev.create_qp = iwch_create_qp;
+       dev->ibdev.modify_qp = iwch_ib_modify_qp;
+       dev->ibdev.destroy_qp = iwch_destroy_qp;
+       dev->ibdev.create_cq = iwch_create_cq;
+       dev->ibdev.destroy_cq = iwch_destroy_cq;
+       dev->ibdev.resize_cq = iwch_resize_cq;
+       dev->ibdev.poll_cq = iwch_poll_cq;
+       dev->ibdev.get_dma_mr = iwch_get_dma_mr;
+       dev->ibdev.reg_phys_mr = iwch_register_phys_mem;
+       dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem;
+       dev->ibdev.reg_user_mr = iwch_reg_user_mr;
+       dev->ibdev.dereg_mr = iwch_dereg_mr;
+       dev->ibdev.alloc_mw = iwch_alloc_mw;
+       dev->ibdev.bind_mw = iwch_bind_mw;
+       dev->ibdev.dealloc_mw = iwch_dealloc_mw;
+
+       dev->ibdev.attach_mcast = iwch_multicast_attach;
+       dev->ibdev.detach_mcast = iwch_multicast_detach;
+       dev->ibdev.process_mad = iwch_process_mad;
+
+       dev->ibdev.req_notify_cq = iwch_arm_cq;
+       dev->ibdev.post_send = iwch_post_send;
+       dev->ibdev.post_recv = iwch_post_receive;
+
+
+       dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+       if (!dev->ibdev.iwcm)
+               return -ENOMEM;
+
+       dev->ibdev.iwcm->connect = iwch_connect;
+       dev->ibdev.iwcm->accept = iwch_accept_cr;
+       dev->ibdev.iwcm->reject = iwch_reject_cr;
+       dev->ibdev.iwcm->create_listen = iwch_create_listen;
+       dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen;
+       dev->ibdev.iwcm->add_ref = iwch_qp_add_ref;
+       dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref;
+       dev->ibdev.iwcm->get_qp = iwch_get_qp;
+
+       ret = ib_register_device(&dev->ibdev);
+       if (ret)
+               goto bail1;
+
+       for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
+               ret = class_device_create_file(&dev->ibdev.class_dev,
+                                              iwch_class_attributes[i]);
+               if (ret) {
+                       goto bail2;
+               }
+       }
+       return 0;
+bail2:
+       ib_unregister_device(&dev->ibdev);
+bail1:
+       return ret;
+}
+
+void iwch_unregister_device(struct iwch_dev *dev)
+{
+       int i;
+
+       PDBG("%s iwch_dev %p\n", __FUNCTION__, dev);
+       for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
+               class_device_remove_file(&dev->ibdev.class_dev,
+                                        iwch_class_attributes[i]);
+       ib_unregister_device(&dev->ibdev);
+       return;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
new file mode 100644 (file)
index 0000000..61e3278
--- /dev/null
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_PROVIDER_H__
+#define __IWCH_PROVIDER_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <rdma/ib_verbs.h>
+#include <asm/types.h>
+#include "t3cdev.h"
+#include "iwch.h"
+#include "cxio_wr.h"
+#include "cxio_hal.h"
+
+struct iwch_pd {
+       struct ib_pd ibpd;
+       u32 pdid;
+       struct iwch_dev *rhp;
+};
+
+static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct iwch_pd, ibpd);
+}
+
+struct tpt_attributes {
+       u32 stag;
+       u32 state:1;
+       u32 type:2;
+       u32 rsvd:1;
+       enum tpt_mem_perm perms;
+       u32 remote_invaliate_disable:1;
+       u32 zbva:1;
+       u32 mw_bind_enable:1;
+       u32 page_size:5;
+
+       u32 pdid;
+       u32 qpid;
+       u32 pbl_addr;
+       u32 len;
+       u64 va_fbo;
+       u32 pbl_size;
+};
+
+struct iwch_mr {
+       struct ib_mr ibmr;
+       struct iwch_dev *rhp;
+       u64 kva;
+       struct tpt_attributes attr;
+};
+
+typedef struct iwch_mw iwch_mw_handle;
+
+static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct iwch_mr, ibmr);
+}
+
+struct iwch_mw {
+       struct ib_mw ibmw;
+       struct iwch_dev *rhp;
+       u64 kva;
+       struct tpt_attributes attr;
+};
+
+static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw)
+{
+       return container_of(ibmw, struct iwch_mw, ibmw);
+}
+
+struct iwch_cq {
+       struct ib_cq ibcq;
+       struct iwch_dev *rhp;
+       struct t3_cq cq;
+       spinlock_t lock;
+       atomic_t refcnt;
+       wait_queue_head_t wait;
+       u32 __user *user_rptr_addr;
+};
+
+static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct iwch_cq, ibcq);
+}
+
+enum IWCH_QP_FLAGS {
+       QP_QUIESCED = 0x01
+};
+
+struct iwch_mpa_attributes {
+       u8 recv_marker_enabled;
+       u8 xmit_marker_enabled; /* iWARP: enable transmit markers */
+       u8 crc_enabled;
+       u8 version;     /* 0 or 1 */
+};
+
+struct iwch_qp_attributes {
+       u32 scq;
+       u32 rcq;
+       u32 sq_num_entries;
+       u32 rq_num_entries;
+       u32 sq_max_sges;
+       u32 sq_max_sges_rdma_write;
+       u32 rq_max_sges;
+       u32 state;
+       u8 enable_rdma_read;
+       u8 enable_rdma_write;   /* enable inbound Read Resp. */
+       u8 enable_bind;
+       u8 enable_mmid0_fastreg;        /* Enable STAG0 + Fast-register */
+       /*
+        * next_state: the next QP state.  If the current state is
+        * specified, only the QP attributes are modified.
+        */
+       u32 max_ord;
+       u32 max_ird;
+       u32 pd; /* IN */
+       u32 next_state;
+       char terminate_buffer[52];
+       u32 terminate_msg_len;
+       u8 is_terminate_local;
+       struct iwch_mpa_attributes mpa_attr;    /* IN-OUT */
+       struct iwch_ep *llp_stream_handle;
+       char *stream_msg_buf;   /* Last stream msg. before Idle -> RTS */
+       u32 stream_msg_buf_len; /* Only on Idle -> RTS */
+};
+
+struct iwch_qp {
+       struct ib_qp ibqp;
+       struct iwch_dev *rhp;
+       struct iwch_ep *ep;
+       struct iwch_qp_attributes attr;
+       struct t3_wq wq;
+       spinlock_t lock;
+       atomic_t refcnt;
+       wait_queue_head_t wait;
+       enum IWCH_QP_FLAGS flags;
+       struct timer_list timer;
+};
+
+static inline int qp_quiesced(struct iwch_qp *qhp)
+{
+       return qhp->flags & QP_QUIESCED;
+}
+
+static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct iwch_qp, ibqp);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp);
+void iwch_qp_rem_ref(struct ib_qp *qp);
+struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn);
+
+struct iwch_ucontext {
+       struct ib_ucontext ibucontext;
+       struct cxio_ucontext uctx;
+       u32 key;
+       spinlock_t mmap_lock;
+       struct list_head mmaps;
+};
+
+static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c)
+{
+       return container_of(c, struct iwch_ucontext, ibucontext);
+}
+
+struct iwch_mm_entry {
+       struct list_head entry;
+       u64 addr;
+       u32 key;
+       unsigned len;
+};
+
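+/*
+ * Find and unlink the mmap entry previously published to userspace for
+ * this (key, len) pair; returns NULL if no match is found.
+ */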
+static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext,
+                                               u32 key, unsigned len)
+{
+       struct list_head *pos, *nxt;
+       struct iwch_mm_entry *mm;
+
+       spin_lock(&ucontext->mmap_lock);
+       list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+               mm = list_entry(pos, struct iwch_mm_entry, entry);
+               if (mm->key == key && mm->len == len) {
+                       list_del_init(&mm->entry);
+                       spin_unlock(&ucontext->mmap_lock);
+                       PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__,
+                            key, (unsigned long long) mm->addr, mm->len);
+                       return mm;
+               }
+       }
+       spin_unlock(&ucontext->mmap_lock);
+       return NULL;
+}
+
+static inline void insert_mmap(struct iwch_ucontext *ucontext,
+                              struct iwch_mm_entry *mm)
+{
+       spin_lock(&ucontext->mmap_lock);
+       PDBG("%s key 0x%x addr 0x%llx len %d\n", __FUNCTION__,
+            mm->key, (unsigned long long) mm->addr, mm->len);
+       list_add_tail(&mm->entry, &ucontext->mmaps);
+       spin_unlock(&ucontext->mmap_lock);
+}
+
+enum iwch_qp_attr_mask {
+       IWCH_QP_ATTR_NEXT_STATE = 1 << 0,
+       IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+       IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+       IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+       IWCH_QP_ATTR_MAX_ORD = 1 << 11,
+       IWCH_QP_ATTR_MAX_IRD = 1 << 12,
+       IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+       IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+       IWCH_QP_ATTR_MPA_ATTR = 1 << 24,
+       IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+       IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ |
+                                    IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+                                    IWCH_QP_ATTR_MAX_ORD |
+                                    IWCH_QP_ATTR_MAX_IRD |
+                                    IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+                                    IWCH_QP_ATTR_STREAM_MSG_BUFFER |
+                                    IWCH_QP_ATTR_MPA_ATTR |
+                                    IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int iwch_modify_qp(struct iwch_dev *rhp,
+                               struct iwch_qp *qhp,
+                               enum iwch_qp_attr_mask mask,
+                               struct iwch_qp_attributes *attrs,
+                               int internal);
+
+enum iwch_qp_state {
+       IWCH_QP_STATE_IDLE,
+       IWCH_QP_STATE_RTS,
+       IWCH_QP_STATE_ERROR,
+       IWCH_QP_STATE_TERMINATE,
+       IWCH_QP_STATE_CLOSING,
+       IWCH_QP_STATE_TOT
+};
+
+static inline int iwch_convert_state(enum ib_qp_state ib_state)
+{
+       switch (ib_state) {
+       case IB_QPS_RESET:
+       case IB_QPS_INIT:
+               return IWCH_QP_STATE_IDLE;
+       case IB_QPS_RTS:
+               return IWCH_QP_STATE_RTS;
+       case IB_QPS_SQD:
+               return IWCH_QP_STATE_CLOSING;
+       case IB_QPS_SQE:
+               return IWCH_QP_STATE_TERMINATE;
+       case IB_QPS_ERR:
+               return IWCH_QP_STATE_ERROR;
+       default:
+               return -1;
+       }
+}
+
+enum iwch_mem_perms {
+       IWCH_MEM_ACCESS_LOCAL_READ = 1 << 0,
+       IWCH_MEM_ACCESS_LOCAL_WRITE = 1 << 1,
+       IWCH_MEM_ACCESS_REMOTE_READ = 1 << 2,
+       IWCH_MEM_ACCESS_REMOTE_WRITE = 1 << 3,
+       IWCH_MEM_ACCESS_ATOMICS = 1 << 4,
+       IWCH_MEM_ACCESS_BINDING = 1 << 5,
+       IWCH_MEM_ACCESS_LOCAL =
+           (IWCH_MEM_ACCESS_LOCAL_READ | IWCH_MEM_ACCESS_LOCAL_WRITE),
+       IWCH_MEM_ACCESS_REMOTE =
+           (IWCH_MEM_ACCESS_REMOTE_WRITE | IWCH_MEM_ACCESS_REMOTE_READ)
+           /* cannot go beyond 1 << 31 */
+} __attribute__ ((packed));
+
+static inline u32 iwch_convert_access(int acc)
+{
+       return (acc & IB_ACCESS_REMOTE_WRITE ? IWCH_MEM_ACCESS_REMOTE_WRITE : 0)
+           | (acc & IB_ACCESS_REMOTE_READ ? IWCH_MEM_ACCESS_REMOTE_READ : 0) |
+           (acc & IB_ACCESS_LOCAL_WRITE ? IWCH_MEM_ACCESS_LOCAL_WRITE : 0) |
+           (acc & IB_ACCESS_MW_BIND ? IWCH_MEM_ACCESS_BINDING : 0) |
+           IWCH_MEM_ACCESS_LOCAL_READ;
+}
+
+enum iwch_mmid_state {
+       IWCH_STAG_STATE_VALID,
+       IWCH_STAG_STATE_INVALID
+};
+
+enum iwch_qp_query_flags {
+       IWCH_QP_QUERY_CONTEXT_NONE = 0x0,       /* No ctx; Only attrs */
+       IWCH_QP_QUERY_CONTEXT_GET = 0x1,        /* Get ctx + attrs */
+       IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2,    /* Not Supported */
+
+       /*
+        * Quiesce QP context; Consumer
+        * will NOT replay outstanding WR
+        */
+       IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4,
+       IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8,
+       IWCH_QP_QUERY_TEST_USERWRITE = 0x32     /* Test special */
+};
+
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr);
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                     struct ib_recv_wr **bad_wr);
+int iwch_bind_mw(struct ib_qp *qp,
+                            struct ib_mw *mw,
+                            struct ib_mw_bind *mw_bind);
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
+int iwch_register_device(struct iwch_dev *dev);
+void iwch_unregister_device(struct iwch_dev *dev);
+int iwch_quiesce_qps(struct iwch_cq *chp);
+int iwch_resume_qps(struct iwch_cq *chp);
+void stop_read_rep_timer(struct iwch_qp *qhp);
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+                                       struct iwch_mr *mhp,
+                                       int shift,
+                                       __be64 *page_list);
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+                                       struct iwch_mr *mhp,
+                                       int shift,
+                                       __be64 *page_list,
+                                       int npages);
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+                                       int num_phys_buf,
+                                       u64 *iova_start,
+                                       u64 *total_size,
+                                       int *npages,
+                                       int *shift,
+                                       __be64 **page_list);
+
+
+#define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
new file mode 100644 (file)
index 0000000..e066727
--- /dev/null
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+
+#define NO_SUPPORT -1
+
+static inline int iwch_build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+                                      u8 * flit_cnt)
+{
+       int i;
+       u32 plen;
+
+       switch (wr->opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               if (wr->send_flags & IB_SEND_SOLICITED)
+                       wqe->send.rdmaop = T3_SEND_WITH_SE;
+               else
+                       wqe->send.rdmaop = T3_SEND;
+               wqe->send.rem_stag = 0;
+               break;
+#if 0                          /* Not currently supported */
+       case TYPE_SEND_INVALIDATE:
+       case TYPE_SEND_INVALIDATE_IMMEDIATE:
+               wqe->send.rdmaop = T3_SEND_WITH_INV;
+               wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+               break;
+       case TYPE_SEND_SE_INVALIDATE:
+               wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+               wqe->send.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+               break;
+#endif
+       default:
+               break;
+       }
+       if (wr->num_sge > T3_MAX_SGE)
+               return -EINVAL;
+       wqe->send.reserved[0] = 0;
+       wqe->send.reserved[1] = 0;
+       wqe->send.reserved[2] = 0;
+       if (wr->opcode == IB_WR_SEND_WITH_IMM) {
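+               /*
+                * The immediate data is carried in the stag field of SGL
+                * entry 0; plen covers just the 4 immediate bytes.
+                */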
+               plen = 4;
+               wqe->send.sgl[0].stag = wr->imm_data;
+               wqe->send.sgl[0].len = __constant_cpu_to_be32(0);
+               wqe->send.num_sgle = __constant_cpu_to_be32(0);
+               *flit_cnt = 5;
+       } else {
+               plen = 0;
+               for (i = 0; i < wr->num_sge; i++) {
+                       if ((plen + wr->sg_list[i].length) < plen) {
+                               return -EMSGSIZE;
+                       }
+                       plen += wr->sg_list[i].length;
+                       wqe->send.sgl[i].stag =
+                           cpu_to_be32(wr->sg_list[i].lkey);
+                       wqe->send.sgl[i].len =
+                           cpu_to_be32(wr->sg_list[i].length);
+                       wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+               }
+               wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
+               *flit_cnt = 4 + ((wr->num_sge) << 1);
+       }
+       wqe->send.plen = cpu_to_be32(plen);
+       return 0;
+}
+
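+/*
+ * Build a T3 RDMA WRITE WQE from an IB send work request and report
+ * the resulting flit count.
+ */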
+static inline int iwch_build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+                                       u8 *flit_cnt)
+{
+       int i;
+       u32 plen;
+       if (wr->num_sge > T3_MAX_SGE)
+               return -EINVAL;
+       wqe->write.rdmaop = T3_RDMA_WRITE;
+       wqe->write.reserved[0] = 0;
+       wqe->write.reserved[1] = 0;
+       wqe->write.reserved[2] = 0;
+       wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+       wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+
+       if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+               plen = 4;
+               wqe->write.sgl[0].stag = wr->imm_data;
+               wqe->write.sgl[0].len = __constant_cpu_to_be32(0);
+               wqe->write.num_sgle = __constant_cpu_to_be32(0);
+               *flit_cnt = 6;
+       } else {
+               plen = 0;
+               for (i = 0; i < wr->num_sge; i++) {
+                       if ((plen + wr->sg_list[i].length) < plen) {
+                               return -EMSGSIZE;
+                       }
+                       plen += wr->sg_list[i].length;
+                       wqe->write.sgl[i].stag =
+                           cpu_to_be32(wr->sg_list[i].lkey);
+                       wqe->write.sgl[i].len =
+                           cpu_to_be32(wr->sg_list[i].length);
+                       wqe->write.sgl[i].to =
+                           cpu_to_be64(wr->sg_list[i].addr);
+               }
+               wqe->write.num_sgle = cpu_to_be32(wr->num_sge);
+               *flit_cnt = 5 + ((wr->num_sge) << 1);
+       }
+       wqe->write.plen = cpu_to_be32(plen);
+       return 0;
+}
+
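+/*
+ * Build a T3 RDMA READ request WQE.  Only a single SGE is supported.
+ */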
+static inline int iwch_build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+                                      u8 *flit_cnt)
+{
+       if (wr->num_sge > 1)
+               return -EINVAL;
+       wqe->read.rdmaop = T3_READ_REQ;
+       wqe->read.reserved[0] = 0;
+       wqe->read.reserved[1] = 0;
+       wqe->read.reserved[2] = 0;
+       wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+       wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr);
+       wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
+       wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length);
+       wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr);
+       *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
+       return 0;
+}
+
+/*
+ * TBD: this is going to be moved to firmware. Missing pdid/qpid check for now.
+ */
+static inline int iwch_sgl2pbl_map(struct iwch_dev *rhp,
+                                  struct ib_sge *sg_list, u32 num_sgle,
+                                  u32 * pbl_addr, u8 * page_size)
+{
+       int i;
+       struct iwch_mr *mhp;
+       u32 offset;
+       for (i = 0; i < num_sgle; i++) {
+
+               mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
+               if (!mhp) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EIO;
+               }
+               if (!mhp->attr.state) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EIO;
+               }
+               if (mhp->attr.zbva) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EIO;
+               }
+
+               if (sg_list[i].addr < mhp->attr.va_fbo) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EINVAL;
+               }
+               if (sg_list[i].addr + ((u64) sg_list[i].length) <
+                   sg_list[i].addr) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EINVAL;
+               }
+               if (sg_list[i].addr + ((u64) sg_list[i].length) >
+                   mhp->attr.va_fbo + ((u64) mhp->attr.len)) {
+                       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+                       return -EINVAL;
+               }
+               offset = sg_list[i].addr - mhp->attr.va_fbo;
+               offset += ((u32) mhp->attr.va_fbo) %
+                         (1UL << (12 + mhp->attr.page_size));
+               pbl_addr[i] = ((mhp->attr.pbl_addr -
+                               rhp->rdev.rnic_info.pbl_base) >> 3) +
+                             (offset >> (12 + mhp->attr.page_size));
+               page_size[i] = mhp->attr.page_size;
+       }
+       return 0;
+}
+
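+/*
+ * Build a T3 receive WQE: map each SGE to its PBL address and page
+ * offset, then zero out any unused SGL slots.
+ */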
+static inline int iwch_build_rdma_recv(struct iwch_dev *rhp,
+                                                   union t3_wr *wqe,
+                                                   struct ib_recv_wr *wr)
+{
+       int i, err = 0;
+       u32 pbl_addr[4];
+       u8 page_size[4];
+       if (wr->num_sge > T3_MAX_SGE)
+               return -EINVAL;
+       err = iwch_sgl2pbl_map(rhp, wr->sg_list, wr->num_sge, pbl_addr,
+                              page_size);
+       if (err)
+               return err;
+       wqe->recv.pagesz[0] = page_size[0];
+       wqe->recv.pagesz[1] = page_size[1];
+       wqe->recv.pagesz[2] = page_size[2];
+       wqe->recv.pagesz[3] = page_size[3];
+       wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+       for (i = 0; i < wr->num_sge; i++) {
+               wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+               wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+
+               /* 'to' in the WQE is the offset into the page */
+               wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) %
+                               (1UL << (12 + page_size[i])));
+
+               /* pbl_addr is the adapter's address in the PBL */
+               wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+       }
+       for (; i < T3_MAX_SGE; i++) {
+               wqe->recv.sgl[i].stag = 0;
+               wqe->recv.sgl[i].len = 0;
+               wqe->recv.sgl[i].to = 0;
+               wqe->recv.pbl_addr[i] = 0;
+       }
+       return 0;
+}
+
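+/*
+ * Post send work requests: translate each WR into a T3 WQE, record it
+ * in the software send queue, and ring the doorbell once all WRs have
+ * been queued.
+ */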
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr)
+{
+       int err = 0;
+       u8 t3_wr_flit_cnt;
+       enum t3_wr_opcode t3_wr_opcode = 0;
+       enum t3_wr_flags t3_wr_flags;
+       struct iwch_qp *qhp;
+       u32 idx;
+       union t3_wr *wqe;
+       u32 num_wrs;
+       unsigned long flag;
+       struct t3_swsq *sqp;
+
+       qhp = to_iwch_qp(ibqp);
+       spin_lock_irqsave(&qhp->lock, flag);
+       if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -EINVAL;
+       }
+       num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+                 qhp->wq.sq_size_log2);
+       if (num_wrs <= 0) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -ENOMEM;
+       }
+       while (wr) {
+               if (num_wrs == 0) {
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       break;
+               }
+               idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+               wqe = (union t3_wr *) (qhp->wq.queue + idx);
+               t3_wr_flags = 0;
+               if (wr->send_flags & IB_SEND_SOLICITED)
+                       t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
+               if (wr->send_flags & IB_SEND_FENCE)
+                       t3_wr_flags |= T3_READ_FENCE_FLAG;
+               if (wr->send_flags & IB_SEND_SIGNALED)
+                       t3_wr_flags |= T3_COMPLETION_FLAG;
+               sqp = qhp->wq.sq +
+                     Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+               switch (wr->opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       t3_wr_opcode = T3_WR_SEND;
+                       err = iwch_build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+                       break;
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       t3_wr_opcode = T3_WR_WRITE;
+                       err = iwch_build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+                       break;
+               case IB_WR_RDMA_READ:
+                       t3_wr_opcode = T3_WR_READ;
+                       t3_wr_flags = 0; /* T3 reads are always signaled */
+                       err = iwch_build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+                       if (err)
+                               break;
+                       sqp->read_len = wqe->read.local_len;
+                       if (!qhp->wq.oldest_read)
+                               qhp->wq.oldest_read = sqp;
+                       break;
+               default:
+                       PDBG("%s post of type=%d TBD!\n", __FUNCTION__,
+                            wr->opcode);
+                       err = -EINVAL;
+               }
+               if (err) {
+                       *bad_wr = wr;
+                       break;
+               }
+               wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+               sqp->wr_id = wr->wr_id;
+               sqp->opcode = wr2opcode(t3_wr_opcode);
+               sqp->sq_wptr = qhp->wq.sq_wptr;
+               sqp->complete = 0;
+               sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED);
+
+               build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
+                              Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+                              0, t3_wr_flit_cnt);
+               PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
+                    __FUNCTION__, (unsigned long long) wr->wr_id, idx,
+                    Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
+                    sqp->opcode);
+               wr = wr->next;
+               num_wrs--;
+               ++(qhp->wq.wptr);
+               ++(qhp->wq.sq_wptr);
+       }
+       spin_unlock_irqrestore(&qhp->lock, flag);
+       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+       return err;
+}
+
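+/*
+ * Post receive work requests on the RQ and ring the doorbell.
+ */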
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                     struct ib_recv_wr **bad_wr)
+{
+       int err = 0;
+       struct iwch_qp *qhp;
+       u32 idx;
+       union t3_wr *wqe;
+       u32 num_wrs;
+       unsigned long flag;
+
+       qhp = to_iwch_qp(ibqp);
+       spin_lock_irqsave(&qhp->lock, flag);
+       if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -EINVAL;
+       }
+       num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr,
+                           qhp->wq.rq_size_log2) - 1;
+       if (!wr) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -EINVAL;
+       }
+       while (wr) {
+               idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+               wqe = (union t3_wr *) (qhp->wq.queue + idx);
+               if (num_wrs)
+                       err = iwch_build_rdma_recv(qhp->rhp, wqe, wr);
+               else
+                       err = -ENOMEM;
+               if (err) {
+                       *bad_wr = wr;
+                       break;
+               }
+               qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr, qhp->wq.rq_size_log2)] =
+                       wr->wr_id;
+               build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
+                              Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+                              0, sizeof(struct t3_receive_wr) >> 3);
+               PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rq_rptr 0x%x "
+                    "wqe %p\n", __FUNCTION__, (unsigned long long) wr->wr_id,
+                    idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
+               ++(qhp->wq.rq_wptr);
+               ++(qhp->wq.wptr);
+               wr = wr->next;
+               num_wrs--;
+       }
+       spin_unlock_irqrestore(&qhp->lock, flag);
+       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+       return err;
+}
+
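+/*
+ * Post a memory window bind WR on the send queue.
+ */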
+int iwch_bind_mw(struct ib_qp *qp,
+                            struct ib_mw *mw,
+                            struct ib_mw_bind *mw_bind)
+{
+       struct iwch_dev *rhp;
+       struct iwch_mw *mhp;
+       struct iwch_qp *qhp;
+       union t3_wr *wqe;
+       u32 pbl_addr;
+       u8 page_size;
+       u32 num_wrs;
+       unsigned long flag;
+       struct ib_sge sgl;
+       int err=0;
+       enum t3_wr_flags t3_wr_flags;
+       u32 idx;
+       struct t3_swsq *sqp;
+
+       qhp = to_iwch_qp(qp);
+       mhp = to_iwch_mw(mw);
+       rhp = qhp->rhp;
+
+       spin_lock_irqsave(&qhp->lock, flag);
+       if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -EINVAL;
+       }
+       num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+                           qhp->wq.sq_size_log2);
+       if ((num_wrs) <= 0) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return -ENOMEM;
+       }
+       idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+       PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __FUNCTION__, idx,
+            mw, mw_bind);
+       wqe = (union t3_wr *) (qhp->wq.queue + idx);
+
+       t3_wr_flags = 0;
+       if (mw_bind->send_flags & IB_SEND_SIGNALED)
+               t3_wr_flags = T3_COMPLETION_FLAG;
+
+       sgl.addr = mw_bind->addr;
+       sgl.lkey = mw_bind->mr->lkey;
+       sgl.length = mw_bind->length;
+       wqe->bind.reserved = 0;
+       wqe->bind.type = T3_VA_BASED_TO;
+
+       /* TBD: check perms */
+       wqe->bind.perms = iwch_convert_access(mw_bind->mw_access_flags);
+       wqe->bind.mr_stag = cpu_to_be32(mw_bind->mr->lkey);
+       wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
+       wqe->bind.mw_len = cpu_to_be32(mw_bind->length);
+       wqe->bind.mw_va = cpu_to_be64(mw_bind->addr);
+       err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
+       if (err) {
+               spin_unlock_irqrestore(&qhp->lock, flag);
+               return err;
+       }
+       wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+       sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+       sqp->wr_id = mw_bind->wr_id;
+       sqp->opcode = T3_BIND_MW;
+       sqp->sq_wptr = qhp->wq.sq_wptr;
+       sqp->complete = 0;
+       sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
+       wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
+       wqe->bind.mr_pagesz = page_size;
+       wqe->flit[T3_SQ_COOKIE_FLIT] = mw_bind->wr_id;
+       build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
+                      Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
+                               sizeof(struct t3_bind_mw_wr) >> 3);
+       ++(qhp->wq.wptr);
+       ++(qhp->wq.sq_wptr);
+       spin_unlock_irqrestore(&qhp->lock, flag);
+
+       ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+       return err;
+}
+
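+/*
+ * Map a T3 error status (and whether the failing operation was tagged)
+ * to the RDMAP/DDP layer type and error code carried in a TERMINATE
+ * message.
+ */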
+static inline void build_term_codes(int t3err, u8 *layer_type, u8 *ecode,
+                                   int tagged)
+{
+       switch (t3err) {
+       case TPT_ERR_STAG:
+               if (tagged == 1) {
+                       *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+                       *ecode = DDPT_INV_STAG;
+               } else if (tagged == 2) {
+                       *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+                       *ecode = RDMAP_INV_STAG;
+               }
+               break;
+       case TPT_ERR_PDID:
+       case TPT_ERR_QPID:
+       case TPT_ERR_ACCESS:
+               if (tagged == 1) {
+                       *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+                       *ecode = DDPT_STAG_NOT_ASSOC;
+               } else if (tagged == 2) {
+                       *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+                       *ecode = RDMAP_STAG_NOT_ASSOC;
+               }
+               break;
+       case TPT_ERR_WRAP:
+               *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+               *ecode = RDMAP_TO_WRAP;
+               break;
+       case TPT_ERR_BOUND:
+               if (tagged == 1) {
+                       *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+                       *ecode = DDPT_BASE_BOUNDS;
+               } else if (tagged == 2) {
+                       *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+                       *ecode = RDMAP_BASE_BOUNDS;
+               } else {
+                       *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+                       *ecode = DDPU_MSG_TOOBIG;
+               }
+               break;
+       case TPT_ERR_INVALIDATE_SHARED_MR:
+       case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+               *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+               *ecode = RDMAP_CANT_INV_STAG;
+               break;
+       case TPT_ERR_ECC:
+       case TPT_ERR_ECC_PSTAG:
+       case TPT_ERR_INTERNAL_ERR:
+               *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+               *ecode = 0;
+               break;
+       case TPT_ERR_OUT_OF_RQE:
+               *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+               *ecode = DDPU_INV_MSN_NOBUF;
+               break;
+       case TPT_ERR_PBL_ADDR_BOUND:
+               *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+               *ecode = DDPT_BASE_BOUNDS;
+               break;
+       case TPT_ERR_CRC:
+               *layer_type = LAYER_MPA|DDP_LLP;
+               *ecode = MPA_CRC_ERR;
+               break;
+       case TPT_ERR_MARKER:
+               *layer_type = LAYER_MPA|DDP_LLP;
+               *ecode = MPA_MARKER_ERR;
+               break;
+       case TPT_ERR_PDU_LEN_ERR:
+               *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+               *ecode = DDPU_MSG_TOOBIG;
+               break;
+       case TPT_ERR_DDP_VERSION:
+               if (tagged) {
+                       *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+                       *ecode = DDPT_INV_VERS;
+               } else {
+                       *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+                       *ecode = DDPU_INV_VERS;
+               }
+               break;
+       case TPT_ERR_RDMA_VERSION:
+               *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+               *ecode = RDMAP_INV_VERS;
+               break;
+       case TPT_ERR_OPCODE:
+               *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+               *ecode = RDMAP_INV_OPCODE;
+               break;
+       case TPT_ERR_DDP_QUEUE_NUM:
+               *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+               *ecode = DDPU_INV_QN;
+               break;
+       case TPT_ERR_MSN:
+       case TPT_ERR_MSN_GAP:
+       case TPT_ERR_MSN_RANGE:
+       case TPT_ERR_IRD_OVERFLOW:
+               *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+               *ecode = DDPU_INV_MSN_RANGE;
+               break;
+       case TPT_ERR_TBIT:
+               *layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+               *ecode = 0;
+               break;
+       case TPT_ERR_MO:
+               *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+               *ecode = DDPU_INV_MO;
+               break;
+       default:
+               *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+               *ecode = 0;
+               break;
+       }
+}
+
+/*
+ * This posts a TERMINATE with layer=RDMA, type=catastrophic.
+ */
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg)
+{
+       union t3_wr *wqe;
+       struct terminate_message *term;
+       int status;
+       int tagged = 0;
+       struct sk_buff *skb;
+
+       PDBG("%s %d\n", __FUNCTION__, __LINE__);
+       skb = alloc_skb(40, GFP_ATOMIC);
+       if (!skb) {
+               printk(KERN_ERR "%s cannot send TERMINATE!\n", __FUNCTION__);
+               return -ENOMEM;
+       }
+       wqe = (union t3_wr *)skb_put(skb, 40);
+       memset(wqe, 0, 40);
+       wqe->send.rdmaop = T3_TERMINATE;
+
+       /* immediate data length */
+       wqe->send.plen = htonl(4);
+
+       /* immediate data starts here. */
+       term = (struct terminate_message *)wqe->send.sgl;
+       if (rsp_msg) {
+               status = CQE_STATUS(rsp_msg->cqe);
+               if (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)
+                       tagged = 1;
+               if ((CQE_OPCODE(rsp_msg->cqe) == T3_READ_REQ) ||
+                   (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP))
+                       tagged = 2;
+       } else {
+               status = TPT_ERR_INTERNAL_ERR;
+       }
+       build_term_codes(status, &term->layer_etype, &term->ecode, tagged);
+       build_fw_riwrh((void *)wqe, T3_WR_SEND,
+                      T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 1,
+                      qhp->ep->hwtid, 5);
+       skb->priority = CPL_PRIORITY_DATA;
+       return cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
+{
+       struct iwch_cq *rchp, *schp;
+       int count;
+
+       rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+       schp = get_chp(qhp->rhp, qhp->attr.scq);
+
+       PDBG("%s qhp %p rchp %p schp %p\n", __FUNCTION__, qhp, rchp, schp);
+       /* take a ref on the qhp since we must release the lock */
+       atomic_inc(&qhp->refcnt);
+       spin_unlock_irqrestore(&qhp->lock, *flag);
+
+       /* locking hierarchy: cq lock first, then qp lock. */
+       spin_lock_irqsave(&rchp->lock, *flag);
+       spin_lock(&qhp->lock);
+       cxio_flush_hw_cq(&rchp->cq);
+       cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
+       cxio_flush_rq(&qhp->wq, &rchp->cq, count);
+       spin_unlock(&qhp->lock);
+       spin_unlock_irqrestore(&rchp->lock, *flag);
+
+       /* locking hierarchy: cq lock first, then qp lock. */
+       spin_lock_irqsave(&schp->lock, *flag);
+       spin_lock(&qhp->lock);
+       cxio_flush_hw_cq(&schp->cq);
+       cxio_count_scqes(&schp->cq, &qhp->wq, &count);
+       cxio_flush_sq(&qhp->wq, &schp->cq, count);
+       spin_unlock(&qhp->lock);
+       spin_unlock_irqrestore(&schp->lock, *flag);
+
+       /* deref */
+       if (atomic_dec_and_test(&qhp->refcnt))
+               wake_up(&qhp->wait);
+
+       spin_lock_irqsave(&qhp->lock, *flag);
+}
+
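+/*
+ * On T3B devices just mark the WQ in error; otherwise flush the
+ * queues in software via __flush_qp().
+ */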
+static inline void flush_qp(struct iwch_qp *qhp, unsigned long *flag)
+{
+       if (t3b_device(qhp->rhp))
+               cxio_set_wq_in_error(&qhp->wq);
+       else
+               __flush_qp(qhp, flag);
+}
+
+
+/*
+ * Return non-zero if at least one RECV was pre-posted.
+ */
+static inline int rqes_posted(struct iwch_qp *qhp)
+{
+       return fw_riwrh_opcode((struct fw_riwrh *)qhp->wq.queue) == T3_WR_RCV;
+}
+
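+/*
+ * Program the adapter's RDMA init attributes (queue IDs, MPA and QP
+ * capabilities, WQ DMA address) via cxio_rdma_init().
+ */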
+static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
+                               enum iwch_qp_attr_mask mask,
+                               struct iwch_qp_attributes *attrs)
+{
+       struct t3_rdma_init_attr init_attr;
+       int ret;
+
+       init_attr.tid = qhp->ep->hwtid;
+       init_attr.qpid = qhp->wq.qpid;
+       init_attr.pdid = qhp->attr.pd;
+       init_attr.scqid = qhp->attr.scq;
+       init_attr.rcqid = qhp->attr.rcq;
+       init_attr.rq_addr = qhp->wq.rq_addr;
+       init_attr.rq_size = 1 << qhp->wq.rq_size_log2;
+       init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE |
+               qhp->attr.mpa_attr.recv_marker_enabled |
+               (qhp->attr.mpa_attr.xmit_marker_enabled << 1) |
+               (qhp->attr.mpa_attr.crc_enabled << 2);
+
+       /*
+        * XXX - The IWCM doesn't quite handle getting these
+        * attrs set before going into RTS.  For now, just turn
+        * them on always...
+        */
+#if 0
+       init_attr.qpcaps = qhp->attr.enableRdmaRead |
+               (qhp->attr.enableRdmaWrite << 1) |
+               (qhp->attr.enableBind << 2) |
+               (qhp->attr.enable_stag0_fastreg << 3) |
+               (qhp->attr.enable_stag0_fastreg << 4);
+#else
+       init_attr.qpcaps = 0x1f;
+#endif
+       init_attr.tcp_emss = qhp->ep->emss;
+       init_attr.ord = qhp->attr.max_ord;
+       init_attr.ird = qhp->attr.max_ird;
+       init_attr.qp_dma_addr = qhp->wq.dma_addr;
+       init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
+       init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0;
+       PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d "
+            "flags 0x%x qpcaps 0x%x\n", __FUNCTION__,
+            init_attr.rq_addr, init_attr.rq_size,
+            init_attr.flags, init_attr.qpcaps);
+       ret = cxio_rdma_init(&rhp->rdev, &init_attr);
+       PDBG("%s ret %d\n", __FUNCTION__, ret);
+       return ret;
+}
+
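+/*
+ * QP state machine: apply attribute changes while in IDLE, then handle
+ * the IDLE/RTS/CLOSING/TERMINATE/ERROR transitions, driving connection
+ * teardown (terminate, disconnect, EP deref) outside the QP lock.
+ */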
+int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
+                               enum iwch_qp_attr_mask mask,
+                               struct iwch_qp_attributes *attrs,
+                               int internal)
+{
+       int ret = 0;
+       struct iwch_qp_attributes newattr = qhp->attr;
+       unsigned long flag;
+       int disconnect = 0;
+       int terminate = 0;
+       int abort = 0;
+       int free = 0;
+       struct iwch_ep *ep = NULL;
+
+       PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __FUNCTION__,
+            qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state,
+            (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+       spin_lock_irqsave(&qhp->lock, flag);
+
+       /* Process attr changes if in IDLE */
+       if (mask & IWCH_QP_ATTR_VALID_MODIFY) {
+               if (qhp->attr.state != IWCH_QP_STATE_IDLE) {
+                       ret = -EIO;
+                       goto out;
+               }
+               if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ)
+                       newattr.enable_rdma_read = attrs->enable_rdma_read;
+               if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE)
+                       newattr.enable_rdma_write = attrs->enable_rdma_write;
+               if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND)
+                       newattr.enable_bind = attrs->enable_bind;
+               if (mask & IWCH_QP_ATTR_MAX_ORD) {
+                       if (attrs->max_ord >
+                           rhp->attr.max_rdma_read_qp_depth) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       newattr.max_ord = attrs->max_ord;
+               }
+               if (mask & IWCH_QP_ATTR_MAX_IRD) {
+                       if (attrs->max_ird >
+                           rhp->attr.max_rdma_reads_per_qp) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       newattr.max_ird = attrs->max_ird;
+               }
+               qhp->attr = newattr;
+       }
+
+       if (!(mask & IWCH_QP_ATTR_NEXT_STATE))
+               goto out;
+       if (qhp->attr.state == attrs->next_state)
+               goto out;
+
+       switch (qhp->attr.state) {
+       case IWCH_QP_STATE_IDLE:
+               switch (attrs->next_state) {
+               case IWCH_QP_STATE_RTS:
+                       if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       qhp->attr.mpa_attr = attrs->mpa_attr;
+                       qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+                       qhp->ep = qhp->attr.llp_stream_handle;
+                       qhp->attr.state = IWCH_QP_STATE_RTS;
+
+                       /*
+                        * Ref the endpoint here and deref when we
+                        * disassociate the endpoint from the QP.  This
+                        * happens in CLOSING->IDLE transition or *->ERROR
+                        * transition.
+                        */
+                       get_ep(&qhp->ep->com);
+                       spin_unlock_irqrestore(&qhp->lock, flag);
+                       ret = rdma_init(rhp, qhp, mask, attrs);
+                       spin_lock_irqsave(&qhp->lock, flag);
+                       if (ret)
+                               goto err;
+                       break;
+               case IWCH_QP_STATE_ERROR:
+                       qhp->attr.state = IWCH_QP_STATE_ERROR;
+                       flush_qp(qhp, &flag);
+                       break;
+               default:
+                       ret = -EINVAL;
+                       goto out;
+               }
+               break;
+       case IWCH_QP_STATE_RTS:
+               switch (attrs->next_state) {
+               case IWCH_QP_STATE_CLOSING:
+                       BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+                       qhp->attr.state = IWCH_QP_STATE_CLOSING;
+                       if (!internal) {
+                               abort=0;
+                               disconnect = 1;
+                               ep = qhp->ep;
+                       }
+                       break;
+               case IWCH_QP_STATE_TERMINATE:
+                       qhp->attr.state = IWCH_QP_STATE_TERMINATE;
+                       if (!internal)
+                               terminate = 1;
+                       break;
+               case IWCH_QP_STATE_ERROR:
+                       qhp->attr.state = IWCH_QP_STATE_ERROR;
+                       if (!internal) {
+                               abort=1;
+                               disconnect = 1;
+                               ep = qhp->ep;
+                       }
+                       goto err;
+                       break;
+               default:
+                       ret = -EINVAL;
+                       goto out;
+               }
+               break;
+       case IWCH_QP_STATE_CLOSING:
+               if (!internal) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               switch (attrs->next_state) {
+                       case IWCH_QP_STATE_IDLE:
+                               qhp->attr.state = IWCH_QP_STATE_IDLE;
+                               qhp->attr.llp_stream_handle = NULL;
+                               put_ep(&qhp->ep->com);
+                               qhp->ep = NULL;
+                               wake_up(&qhp->wait);
+                               break;
+                       case IWCH_QP_STATE_ERROR:
+                               goto err;
+                       default:
+                               ret = -EINVAL;
+                               goto err;
+               }
+               break;
+       case IWCH_QP_STATE_ERROR:
+               if (attrs->next_state != IWCH_QP_STATE_IDLE) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+
+               if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) ||
+                   !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               qhp->attr.state = IWCH_QP_STATE_IDLE;
+               memset(&qhp->attr, 0, sizeof(qhp->attr));
+               break;
+       case IWCH_QP_STATE_TERMINATE:
+               if (!internal) {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               goto err;
+               break;
+       default:
+               printk(KERN_ERR "%s in a bad state %d\n",
+                      __FUNCTION__, qhp->attr.state);
+               ret = -EINVAL;
+               goto err;
+               break;
+       }
+       goto out;
+err:
+       PDBG("%s disassociating ep %p qpid 0x%x\n", __FUNCTION__, qhp->ep,
+            qhp->wq.qpid);
+
+       /* disassociate the LLP connection */
+       qhp->attr.llp_stream_handle = NULL;
+       ep = qhp->ep;
+       qhp->ep = NULL;
+       qhp->attr.state = IWCH_QP_STATE_ERROR;
+       free=1;
+       wake_up(&qhp->wait);
+       BUG_ON(!ep);
+       flush_qp(qhp, &flag);
+out:
+       spin_unlock_irqrestore(&qhp->lock, flag);
+
+       if (terminate)
+               iwch_post_terminate(qhp, NULL);
+
+       /*
+        * If disconnect is 1, then we need to initiate a disconnect
+        * on the EP.  This can be a normal close (RTS->CLOSING) or
+        * an abnormal close (RTS/CLOSING->ERROR).
+        */
+       if (disconnect)
+               iwch_ep_disconnect(ep, abort, GFP_KERNEL);
+
+       /*
+        * If free is 1, then we've disassociated the EP from the QP
+        * and we need to dereference the EP.
+        */
+       if (free)
+               put_ep(&ep->com);
+
+       PDBG("%s exit state %d\n", __FUNCTION__, qhp->attr.state);
+       return ret;
+}
+
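+/*
+ * Quiesce/resume offload processing on the QP's connection TID.
+ */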
+static int quiesce_qp(struct iwch_qp *qhp)
+{
+       spin_lock_irq(&qhp->lock);
+       iwch_quiesce_tid(qhp->ep);
+       qhp->flags |= QP_QUIESCED;
+       spin_unlock_irq(&qhp->lock);
+       return 0;
+}
+
+static int resume_qp(struct iwch_qp *qhp)
+{
+       spin_lock_irq(&qhp->lock);
+       iwch_resume_tid(qhp->ep);
+       qhp->flags &= ~QP_QUIESCED;
+       spin_unlock_irq(&qhp->lock);
+       return 0;
+}
+
+int iwch_quiesce_qps(struct iwch_cq *chp)
+{
+       int i;
+       struct iwch_qp *qhp;
+
+       for (i=0; i < T3_MAX_NUM_QP; i++) {
+               qhp = get_qhp(chp->rhp, i);
+               if (!qhp)
+                       continue;
+               if ((qhp->attr.rcq == chp->cq.cqid) && !qp_quiesced(qhp)) {
+                       quiesce_qp(qhp);
+                       continue;
+               }
+               if ((qhp->attr.scq == chp->cq.cqid) && !qp_quiesced(qhp))
+                       quiesce_qp(qhp);
+       }
+       return 0;
+}
+
+int iwch_resume_qps(struct iwch_cq *chp)
+{
+       int i;
+       struct iwch_qp *qhp;
+
+       for (i=0; i < T3_MAX_NUM_QP; i++) {
+               qhp = get_qhp(chp->rhp, i);
+               if (!qhp)
+                       continue;
+               if ((qhp->attr.rcq == chp->cq.cqid) && qp_quiesced(qhp)) {
+                       resume_qp(qhp);
+                       continue;
+               }
+               if ((qhp->attr.scq == chp->cq.cqid) && qp_quiesced(qhp))
+                       resume_qp(qhp);
+       }
+       return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h
new file mode 100644 (file)
index 0000000..c4e7fbe
--- /dev/null
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2006 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_USER_H__
+#define __IWCH_USER_H__
+
+#define IWCH_UVERBS_ABI_VERSION        1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct iwch_create_cq_req {
+       __u64 user_rptr_addr;
+};
+
+struct iwch_create_cq_resp {
+       __u64 key;
+       __u32 cqid;
+       __u32 size_log2;
+};
+
+struct iwch_create_qp_resp {
+       __u64 key;
+       __u64 db_key;
+       __u32 qpid;
+       __u32 size_log2;
+       __u32 sq_size_log2;
+       __u32 rq_size_log2;
+};
+
+struct iwch_reg_user_mr_resp {
+       __u32 pbl_addr;
+};
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h
new file mode 100644 (file)
index 0000000..c702dc1
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2007 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _TCB_DEFS_H
+#define _TCB_DEFS_H
+
+#define W_TCB_T_STATE    0
+#define S_TCB_T_STATE    0
+#define M_TCB_T_STATE    0xfULL
+#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE)
+
+#define W_TCB_TIMER    0
+#define S_TCB_TIMER    4
+#define M_TCB_TIMER    0x1ULL
+#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER)
+
+#define W_TCB_DACK_TIMER    0
+#define S_TCB_DACK_TIMER    5
+#define M_TCB_DACK_TIMER    0x1ULL
+#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER)
+
+#define W_TCB_DEL_FLAG    0
+#define S_TCB_DEL_FLAG    6
+#define M_TCB_DEL_FLAG    0x1ULL
+#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG)
+
+#define W_TCB_L2T_IX    0
+#define S_TCB_L2T_IX    7
+#define M_TCB_L2T_IX    0x7ffULL
+#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX)
+
+#define W_TCB_SMAC_SEL    0
+#define S_TCB_SMAC_SEL    18
+#define M_TCB_SMAC_SEL    0x3ULL
+#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL)
+
+#define W_TCB_TOS    0
+#define S_TCB_TOS    20
+#define M_TCB_TOS    0x3fULL
+#define V_TCB_TOS(x) ((x) << S_TCB_TOS)
+
+#define W_TCB_MAX_RT    0
+#define S_TCB_MAX_RT    26
+#define M_TCB_MAX_RT    0xfULL
+#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT)
+
+#define W_TCB_T_RXTSHIFT    0
+#define S_TCB_T_RXTSHIFT    30
+#define M_TCB_T_RXTSHIFT    0xfULL
+#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT)
+
+#define W_TCB_T_DUPACKS    1
+#define S_TCB_T_DUPACKS    2
+#define M_TCB_T_DUPACKS    0xfULL
+#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS)
+
+#define W_TCB_T_MAXSEG    1
+#define S_TCB_T_MAXSEG    6
+#define M_TCB_T_MAXSEG    0xfULL
+#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG)
+
+#define W_TCB_T_FLAGS1    1
+#define S_TCB_T_FLAGS1    10
+#define M_TCB_T_FLAGS1    0xffffffffULL
+#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1)
+
+#define W_TCB_T_MIGRATION    1
+#define S_TCB_T_MIGRATION    20
+#define M_TCB_T_MIGRATION    0x1ULL
+#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION)
+
+#define W_TCB_T_FLAGS2    2
+#define S_TCB_T_FLAGS2    10
+#define M_TCB_T_FLAGS2    0x7fULL
+#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2)
+
+#define W_TCB_SND_SCALE    2
+#define S_TCB_SND_SCALE    17
+#define M_TCB_SND_SCALE    0xfULL
+#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE)
+
+#define W_TCB_RCV_SCALE    2
+#define S_TCB_RCV_SCALE    21
+#define M_TCB_RCV_SCALE    0xfULL
+#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE)
+
+#define W_TCB_SND_UNA_RAW    2
+#define S_TCB_SND_UNA_RAW    25
+#define M_TCB_SND_UNA_RAW    0x7ffffffULL
+#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW)
+
+#define W_TCB_SND_NXT_RAW    3
+#define S_TCB_SND_NXT_RAW    20
+#define M_TCB_SND_NXT_RAW    0x7ffffffULL
+#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW)
+
+#define W_TCB_RCV_NXT    4
+#define S_TCB_RCV_NXT    15
+#define M_TCB_RCV_NXT    0xffffffffULL
+#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT)
+
+#define W_TCB_RCV_ADV    5
+#define S_TCB_RCV_ADV    15
+#define M_TCB_RCV_ADV    0xffffULL
+#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV)
+
+#define W_TCB_SND_MAX_RAW    5
+#define S_TCB_SND_MAX_RAW    31
+#define M_TCB_SND_MAX_RAW    0x7ffffffULL
+#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW)
+
+#define W_TCB_SND_CWND    6
+#define S_TCB_SND_CWND    26
+#define M_TCB_SND_CWND    0x7ffffffULL
+#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND)
+
+#define W_TCB_SND_SSTHRESH    7
+#define S_TCB_SND_SSTHRESH    21
+#define M_TCB_SND_SSTHRESH    0x7ffffffULL
+#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH)
+
+#define W_TCB_T_RTT_TS_RECENT_AGE    8
+#define S_TCB_T_RTT_TS_RECENT_AGE    16
+#define M_TCB_T_RTT_TS_RECENT_AGE    0xffffffffULL
+#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE)
+
+#define W_TCB_T_RTSEQ_RECENT    9
+#define S_TCB_T_RTSEQ_RECENT    16
+#define M_TCB_T_RTSEQ_RECENT    0xffffffffULL
+#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT)
+
+#define W_TCB_T_SRTT    10
+#define S_TCB_T_SRTT    16
+#define M_TCB_T_SRTT    0xffffULL
+#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT)
+
+#define W_TCB_T_RTTVAR    11
+#define S_TCB_T_RTTVAR    0
+#define M_TCB_T_RTTVAR    0xffffULL
+#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR)
+
+#define W_TCB_TS_LAST_ACK_SENT_RAW    11
+#define S_TCB_TS_LAST_ACK_SENT_RAW    16
+#define M_TCB_TS_LAST_ACK_SENT_RAW    0x7ffffffULL
+#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW)
+
+#define W_TCB_DIP    12
+#define S_TCB_DIP    11
+#define M_TCB_DIP    0xffffffffULL
+#define V_TCB_DIP(x) ((x) << S_TCB_DIP)
+
+#define W_TCB_SIP    13
+#define S_TCB_SIP    11
+#define M_TCB_SIP    0xffffffffULL
+#define V_TCB_SIP(x) ((x) << S_TCB_SIP)
+
+#define W_TCB_DP    14
+#define S_TCB_DP    11
+#define M_TCB_DP    0xffffULL
+#define V_TCB_DP(x) ((x) << S_TCB_DP)
+
+#define W_TCB_SP    14
+#define S_TCB_SP    27
+#define M_TCB_SP    0xffffULL
+#define V_TCB_SP(x) ((x) << S_TCB_SP)
+
+#define W_TCB_TIMESTAMP    15
+#define S_TCB_TIMESTAMP    11
+#define M_TCB_TIMESTAMP    0xffffffffULL
+#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP)
+
+#define W_TCB_TIMESTAMP_OFFSET    16
+#define S_TCB_TIMESTAMP_OFFSET    11
+#define M_TCB_TIMESTAMP_OFFSET    0xfULL
+#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET)
+
+#define W_TCB_TX_MAX    16
+#define S_TCB_TX_MAX    15
+#define M_TCB_TX_MAX    0xffffffffULL
+#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX)
+
+#define W_TCB_TX_HDR_PTR_RAW    17
+#define S_TCB_TX_HDR_PTR_RAW    15
+#define M_TCB_TX_HDR_PTR_RAW    0x1ffffULL
+#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW)
+
+#define W_TCB_TX_LAST_PTR_RAW    18
+#define S_TCB_TX_LAST_PTR_RAW    0
+#define M_TCB_TX_LAST_PTR_RAW    0x1ffffULL
+#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW)
+
+#define W_TCB_TX_COMPACT    18
+#define S_TCB_TX_COMPACT    17
+#define M_TCB_TX_COMPACT    0x1ULL
+#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT)
+
+#define W_TCB_RX_COMPACT    18
+#define S_TCB_RX_COMPACT    18
+#define M_TCB_RX_COMPACT    0x1ULL
+#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT)
+
+#define W_TCB_RCV_WND    18
+#define S_TCB_RCV_WND    19
+#define M_TCB_RCV_WND    0x7ffffffULL
+#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND)
+
+#define W_TCB_RX_HDR_OFFSET    19
+#define S_TCB_RX_HDR_OFFSET    14
+#define M_TCB_RX_HDR_OFFSET    0x7ffffffULL
+#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET)
+
+#define W_TCB_RX_FRAG0_START_IDX_RAW    20
+#define S_TCB_RX_FRAG0_START_IDX_RAW    9
+#define M_TCB_RX_FRAG0_START_IDX_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW)
+
+#define W_TCB_RX_FRAG1_START_IDX_OFFSET    21
+#define S_TCB_RX_FRAG1_START_IDX_OFFSET    4
+#define M_TCB_RX_FRAG1_START_IDX_OFFSET    0x7ffffffULL
+#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET)
+
+#define W_TCB_RX_FRAG0_LEN    21
+#define S_TCB_RX_FRAG0_LEN    31
+#define M_TCB_RX_FRAG0_LEN    0x7ffffffULL
+#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN)
+
+#define W_TCB_RX_FRAG1_LEN    22
+#define S_TCB_RX_FRAG1_LEN    26
+#define M_TCB_RX_FRAG1_LEN    0x7ffffffULL
+#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN)
+
+#define W_TCB_NEWRENO_RECOVER    23
+#define S_TCB_NEWRENO_RECOVER    21
+#define M_TCB_NEWRENO_RECOVER    0x7ffffffULL
+#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER)
+
+#define W_TCB_PDU_HAVE_LEN    24
+#define S_TCB_PDU_HAVE_LEN    16
+#define M_TCB_PDU_HAVE_LEN    0x1ULL
+#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN)
+
+#define W_TCB_PDU_LEN    24
+#define S_TCB_PDU_LEN    17
+#define M_TCB_PDU_LEN    0xffffULL
+#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN)
+
+#define W_TCB_RX_QUIESCE    25
+#define S_TCB_RX_QUIESCE    1
+#define M_TCB_RX_QUIESCE    0x1ULL
+#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE)
+
+#define W_TCB_RX_PTR_RAW    25
+#define S_TCB_RX_PTR_RAW    2
+#define M_TCB_RX_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW)
+
+#define W_TCB_CPU_NO    25
+#define S_TCB_CPU_NO    19
+#define M_TCB_CPU_NO    0x7fULL
+#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO)
+
+#define W_TCB_ULP_TYPE    25
+#define S_TCB_ULP_TYPE    26
+#define M_TCB_ULP_TYPE    0xfULL
+#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE)
+
+#define W_TCB_RX_FRAG1_PTR_RAW    25
+#define S_TCB_RX_FRAG1_PTR_RAW    30
+#define M_TCB_RX_FRAG1_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    26
+#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    15
+#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW)
+
+#define W_TCB_RX_FRAG2_PTR_RAW    27
+#define S_TCB_RX_FRAG2_PTR_RAW    10
+#define M_TCB_RX_FRAG2_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_LEN_RAW    27
+#define S_TCB_RX_FRAG2_LEN_RAW    27
+#define M_TCB_RX_FRAG2_LEN_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_PTR_RAW    28
+#define S_TCB_RX_FRAG3_PTR_RAW    22
+#define M_TCB_RX_FRAG3_PTR_RAW    0x1ffffULL
+#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW)
+
+#define W_TCB_RX_FRAG3_LEN_RAW    29
+#define S_TCB_RX_FRAG3_LEN_RAW    7
+#define M_TCB_RX_FRAG3_LEN_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    30
+#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    2
+#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    0x7ffffffULL
+#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW)
+
+#define W_TCB_PDU_HDR_LEN    30
+#define S_TCB_PDU_HDR_LEN    29
+#define M_TCB_PDU_HDR_LEN    0xffULL
+#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN)
+
+#define W_TCB_SLUSH1    31
+#define S_TCB_SLUSH1    5
+#define M_TCB_SLUSH1    0x7ffffULL
+#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1)
+
+#define W_TCB_ULP_RAW    31
+#define S_TCB_ULP_RAW    24
+#define M_TCB_ULP_RAW    0xffULL
+#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW)
+
+#define W_TCB_DDP_RDMAP_VERSION    25
+#define S_TCB_DDP_RDMAP_VERSION    30
+#define M_TCB_DDP_RDMAP_VERSION    0x1ULL
+#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION)
+
+#define W_TCB_MARKER_ENABLE_RX    25
+#define S_TCB_MARKER_ENABLE_RX    31
+#define M_TCB_MARKER_ENABLE_RX    0x1ULL
+#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX)
+
+#define W_TCB_MARKER_ENABLE_TX    26
+#define S_TCB_MARKER_ENABLE_TX    0
+#define M_TCB_MARKER_ENABLE_TX    0x1ULL
+#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX)
+
+#define W_TCB_CRC_ENABLE    26
+#define S_TCB_CRC_ENABLE    1
+#define M_TCB_CRC_ENABLE    0x1ULL
+#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE)
+
+#define W_TCB_IRS_ULP    26
+#define S_TCB_IRS_ULP    2
+#define M_TCB_IRS_ULP    0x1ffULL
+#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP)
+
+#define W_TCB_ISS_ULP    26
+#define S_TCB_ISS_ULP    11
+#define M_TCB_ISS_ULP    0x1ffULL
+#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP)
+
+#define W_TCB_TX_PDU_LEN    26
+#define S_TCB_TX_PDU_LEN    20
+#define M_TCB_TX_PDU_LEN    0x3fffULL
+#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN)
+
+#define W_TCB_TX_PDU_OUT    27
+#define S_TCB_TX_PDU_OUT    2
+#define M_TCB_TX_PDU_OUT    0x1ULL
+#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT)
+
+#define W_TCB_CQ_IDX_SQ    27
+#define S_TCB_CQ_IDX_SQ    3
+#define M_TCB_CQ_IDX_SQ    0xffffULL
+#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ)
+
+#define W_TCB_CQ_IDX_RQ    27
+#define S_TCB_CQ_IDX_RQ    19
+#define M_TCB_CQ_IDX_RQ    0xffffULL
+#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ)
+
+#define W_TCB_QP_ID    28
+#define S_TCB_QP_ID    3
+#define M_TCB_QP_ID    0xffffULL
+#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID)
+
+#define W_TCB_PD_ID    28
+#define S_TCB_PD_ID    19
+#define M_TCB_PD_ID    0xffffULL
+#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID)
+
+#define W_TCB_STAG    29
+#define S_TCB_STAG    3
+#define M_TCB_STAG    0xffffffffULL
+#define V_TCB_STAG(x) ((x) << S_TCB_STAG)
+
+#define W_TCB_RQ_START    30
+#define S_TCB_RQ_START    3
+#define M_TCB_RQ_START    0x3ffffffULL
+#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START)
+
+#define W_TCB_RQ_MSN    30
+#define S_TCB_RQ_MSN    29
+#define M_TCB_RQ_MSN    0x3ffULL
+#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN)
+
+#define W_TCB_RQ_MAX_OFFSET    31
+#define S_TCB_RQ_MAX_OFFSET    7
+#define M_TCB_RQ_MAX_OFFSET    0xfULL
+#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET)
+
+#define W_TCB_RQ_WRITE_PTR    31
+#define S_TCB_RQ_WRITE_PTR    11
+#define M_TCB_RQ_WRITE_PTR    0x3ffULL
+#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR)
+
+#define W_TCB_INB_WRITE_PERM    31
+#define S_TCB_INB_WRITE_PERM    21
+#define M_TCB_INB_WRITE_PERM    0x1ULL
+#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM)
+
+#define W_TCB_INB_READ_PERM    31
+#define S_TCB_INB_READ_PERM    22
+#define M_TCB_INB_READ_PERM    0x1ULL
+#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM)
+
+#define W_TCB_ORD_L_BIT_VLD    31
+#define S_TCB_ORD_L_BIT_VLD    23
+#define M_TCB_ORD_L_BIT_VLD    0x1ULL
+#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD)
+
+#define W_TCB_RDMAP_OPCODE    31
+#define S_TCB_RDMAP_OPCODE    24
+#define M_TCB_RDMAP_OPCODE    0xfULL
+#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE)
+
+#define W_TCB_TX_FLUSH    31
+#define S_TCB_TX_FLUSH    28
+#define M_TCB_TX_FLUSH    0x1ULL
+#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH)
+
+#define W_TCB_TX_OOS_RXMT    31
+#define S_TCB_TX_OOS_RXMT    29
+#define M_TCB_TX_OOS_RXMT    0x1ULL
+#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT)
+
+#define W_TCB_TX_OOS_TXMT    31
+#define S_TCB_TX_OOS_TXMT    30
+#define M_TCB_TX_OOS_TXMT    0x1ULL
+#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT)
+
+#define W_TCB_SLUSH_AUX2    31
+#define S_TCB_SLUSH_AUX2    31
+#define M_TCB_SLUSH_AUX2    0x1ULL
+#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2)
+
+#define W_TCB_RX_FRAG1_PTR_RAW2    25
+#define S_TCB_RX_FRAG1_PTR_RAW2    30
+#define M_TCB_RX_FRAG1_PTR_RAW2    0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2)
+
+#define W_TCB_RX_DDP_FLAGS    26
+#define S_TCB_RX_DDP_FLAGS    15
+#define M_TCB_RX_DDP_FLAGS    0x3ffULL
+#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS)
+
+#define W_TCB_SLUSH_AUX3    26
+#define S_TCB_SLUSH_AUX3    31
+#define M_TCB_SLUSH_AUX3    0x1ffULL
+#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3)
+
+#define W_TCB_RX_DDP_BUF0_OFFSET    27
+#define S_TCB_RX_DDP_BUF0_OFFSET    8
+#define M_TCB_RX_DDP_BUF0_OFFSET    0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET)
+
+#define W_TCB_RX_DDP_BUF0_LEN    27
+#define S_TCB_RX_DDP_BUF0_LEN    30
+#define M_TCB_RX_DDP_BUF0_LEN    0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN)
+
+#define W_TCB_RX_DDP_BUF1_OFFSET    28
+#define S_TCB_RX_DDP_BUF1_OFFSET    20
+#define M_TCB_RX_DDP_BUF1_OFFSET    0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET)
+
+#define W_TCB_RX_DDP_BUF1_LEN    29
+#define S_TCB_RX_DDP_BUF1_LEN    10
+#define M_TCB_RX_DDP_BUF1_LEN    0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN)
+
+#define W_TCB_RX_DDP_BUF0_TAG    30
+#define S_TCB_RX_DDP_BUF0_TAG    0
+#define M_TCB_RX_DDP_BUF0_TAG    0xffffffffULL
+#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG)
+
+#define W_TCB_RX_DDP_BUF1_TAG    31
+#define S_TCB_RX_DDP_BUF1_TAG    0
+#define M_TCB_RX_DDP_BUF1_TAG    0xffffffffULL
+#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG)
+
+#define S_TF_DACK    10
+#define V_TF_DACK(x) ((x) << S_TF_DACK)
+
+#define S_TF_NAGLE    11
+#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE)
+
+#define S_TF_RECV_SCALE    12
+#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE)
+
+#define S_TF_RECV_TSTMP    13
+#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP)
+
+#define S_TF_RECV_SACK    14
+#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK)
+
+#define S_TF_TURBO    15
+#define V_TF_TURBO(x) ((x) << S_TF_TURBO)
+
+#define S_TF_KEEPALIVE    16
+#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE)
+
+#define S_TF_TCAM_BYPASS    17
+#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS)
+
+#define S_TF_CORE_FIN    18
+#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN)
+
+#define S_TF_CORE_MORE    19
+#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE)
+
+#define S_TF_MIGRATING    20
+#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING)
+
+#define S_TF_ACTIVE_OPEN    21
+#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN)
+
+#define S_TF_ASK_MODE    22
+#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE)
+
+#define S_TF_NON_OFFLOAD    23
+#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD)
+
+#define S_TF_MOD_SCHD    24
+#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD)
+
+#define S_TF_MOD_SCHD_REASON0    25
+#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0)
+
+#define S_TF_MOD_SCHD_REASON1    26
+#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1)
+
+#define S_TF_MOD_SCHD_RX    27
+#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX)
+
+#define S_TF_CORE_PUSH    28
+#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH)
+
+#define S_TF_RCV_COALESCE_ENABLE    29
+#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE)
+
+#define S_TF_RCV_COALESCE_PUSH    30
+#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH)
+
+#define S_TF_RCV_COALESCE_LAST_PSH    31
+#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH)
+
+#define S_TF_RCV_COALESCE_HEARTBEAT    32
+#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT)
+
+#define S_TF_HALF_CLOSE    33
+#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE)
+
+#define S_TF_DACK_MSS    34
+#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS)
+
+#define S_TF_CCTRL_SEL0    35
+#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0)
+
+#define S_TF_CCTRL_SEL1    36
+#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1)
+
+#define S_TF_TCP_NEWRENO_FAST_RECOVERY    37
+#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY)
+
+#define S_TF_TX_PACE_AUTO    38
+#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO)
+
+#define S_TF_PEER_FIN_HELD    39
+#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD)
+
+#define S_TF_CORE_URG    40
+#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG)
+
+#define S_TF_RDMA_ERROR    41
+#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR)
+
+#define S_TF_SSWS_DISABLED    42
+#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED)
+
+#define S_TF_DUPACK_COUNT_ODD    43
+#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD)
+
+#define S_TF_TX_CHANNEL    44
+#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL)
+
+#define S_TF_RX_CHANNEL    45
+#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL)
+
+#define S_TF_TX_PACE_FIXED    46
+#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED)
+
+#define S_TF_RDMA_FLM_ERROR    47
+#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR)
+
+#define S_TF_RX_FLOW_CONTROL_DISABLE    48
+#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE)
+
+#endif /* _TCB_DEFS_H */
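
Editor's note: the TCB definitions above follow one naming convention throughout: W_ gives the 32-bit word index of a field inside the TCB, S_ its starting bit, M_ its mask, and V_ shifts a value into position; the single-bit TF_ flags carry only S_ and V_. Below is a minimal user-space sketch (not part of the patch) of how such shift/mask/value macros combine to update a field; the EXAMPLE_FIELD names are illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Same pattern as the tcb.h definitions: shift, mask, value macros. */
#define S_EXAMPLE_FIELD    10
#define M_EXAMPLE_FIELD    0x3fffffULL
#define V_EXAMPLE_FIELD(x) ((uint64_t)(x) << S_EXAMPLE_FIELD)

int main(void)
{
	uint64_t word = 0;

	/* Clear the field, then insert a new value, as a TCB update would. */
	word &= ~(M_EXAMPLE_FIELD << S_EXAMPLE_FIELD);
	word |= V_EXAMPLE_FIELD(0x1234);

	printf("word = 0x%llx\n", (unsigned long long)word);
	return 0;
}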
index c069be8cbcb291868b0419e8867e48d8b7acc6e2..6c4f9f91b15df3e10f9e417f3deac51abcfd4077 100644 (file)
@@ -756,6 +756,8 @@ void ehca_destroy_comp_pool(void)
                if (cpu_online(i))
                        destroy_comp_task(pool, i);
        }
+       free_percpu(pool->cpu_comp_tasks);
+       kfree(pool);
 #endif
 
        return;
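
Editor's note: the two added lines plug the module-unload leak by releasing everything the completion pool setup allocated: free_percpu() pairs with the earlier alloc_percpu() and kfree() with the allocation of the pool structure itself. A minimal user-space sketch of that create/destroy pairing, with plain calloc()/free() standing in for the kernel allocators (not the ehca API):

#include <stdlib.h>

/* Illustrative stand-ins for the per-CPU completion task pool. */
struct comp_task { int dummy; };
struct comp_pool {
	struct comp_task *cpu_comp_tasks;	/* alloc_percpu() in the driver */
};

static struct comp_pool *pool_create(int ncpus)
{
	struct comp_pool *pool = calloc(1, sizeof(*pool));
	if (!pool)
		return NULL;
	pool->cpu_comp_tasks = calloc(ncpus, sizeof(*pool->cpu_comp_tasks));
	if (!pool->cpu_comp_tasks) {
		free(pool);
		return NULL;
	}
	return pool;
}

/* The fix: every allocation made at create time is released at destroy time. */
static void pool_destroy(struct comp_pool *pool)
{
	free(pool->cpu_comp_tasks);	/* free_percpu() in the driver */
	free(pool);			/* kfree() in the driver */
}

int main(void)
{
	struct comp_pool *pool = pool_create(4);
	if (pool)
		pool_destroy(pool);
	return 0;
}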
index 968d1519761c50f64f28ed03e4f5ff8fd6d1e0b1..71314460b11e63b649c9ad55c423b5de3845ef4c 100644 (file)
@@ -1051,7 +1051,11 @@ int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
        MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
        dev_lim->max_eqs = 1 << (field & 0x7);
        MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
-       dev_lim->reserved_mtts = 1 << (field >> 4);
+       if (mthca_is_memfree(dev))
+               dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64),
+                                              MTHCA_MTT_SEG_SIZE) / MTHCA_MTT_SEG_SIZE;
+       else
+               dev_lim->reserved_mtts = 1 << (field >> 4);
        MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
        dev_lim->max_mrw_sz = 1 << field;
        MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
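
Editor's note: on mem-free HCAs the firmware reports reserved MTTs as a count of entries, while the driver accounts for MTTs in segments, so the fix converts entries to bytes and rounds up to whole MTHCA_MTT_SEG_SIZE segments. A small worked sketch of that conversion, assuming the usual 64-byte segment (eight 8-byte entries); the mailbox field value is made up for illustration:

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)     (((x) + (a) - 1) / (a) * (a))
#define MTT_SEG_SIZE    64	/* assumed: 8 MTT entries of 8 bytes each */

int main(void)
{
	unsigned field = 0x40;				/* example mailbox value */
	unsigned reserved_entries = 1u << (field >> 4);	/* 16 entries */

	/* Old code treated this as a segment count; the fix converts
	 * entries -> bytes -> whole segments, rounding up. */
	unsigned reserved_segs =
		ALIGN(reserved_entries * sizeof(uint64_t), MTT_SEG_SIZE) / MTT_SEG_SIZE;

	printf("%u reserved MTT entries -> %u reserved segments\n",
	       reserved_entries, reserved_segs);	/* 16 -> 2 */
	return 0;
}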
index fe5cecf70feddd6ec1b4c4d79dd32713d558126c..b7e42efaf43df30a5981503af8d80f3111a17038 100644 (file)
@@ -464,6 +464,8 @@ void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
 int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
 void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
 
+int mthca_write_mtt_size(struct mthca_dev *dev);
+
 struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
 void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
 int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
index 44bc6cc734abf35351780ca37f724cc35d32bd11..0d9b7d06bbc23da3a5405bbf4acd28fabc964fa6 100644 (file)
@@ -379,7 +379,7 @@ static int mthca_load_fw(struct mthca_dev *mdev)
 
        mdev->fw.arbel.fw_icm =
                mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
-                               GFP_HIGHUSER | __GFP_NOWARN);
+                               GFP_HIGHUSER | __GFP_NOWARN, 0);
        if (!mdev->fw.arbel.fw_icm) {
                mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
                return -ENOMEM;
@@ -412,7 +412,7 @@ err_unmap_fa:
        mthca_UNMAP_FA(mdev, &status);
 
 err_free:
-       mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+       mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
        return err;
 }
 
@@ -441,7 +441,7 @@ static int mthca_init_icm(struct mthca_dev *mdev,
                  (unsigned long long) aux_pages << 2);
 
        mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
-                                                GFP_HIGHUSER | __GFP_NOWARN);
+                                                GFP_HIGHUSER | __GFP_NOWARN, 0);
        if (!mdev->fw.arbel.aux_icm) {
                mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
                return -ENOMEM;
@@ -464,10 +464,15 @@ static int mthca_init_icm(struct mthca_dev *mdev,
                goto err_unmap_aux;
        }
 
+       /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved MTTs */
+       mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * MTHCA_MTT_SEG_SIZE,
+                                          dma_get_cache_alignment()) / MTHCA_MTT_SEG_SIZE;
+
        mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
                                                         MTHCA_MTT_SEG_SIZE,
                                                         mdev->limits.num_mtt_segs,
-                                                        mdev->limits.reserved_mtts, 1);
+                                                        mdev->limits.reserved_mtts,
+                                                        1, 0);
        if (!mdev->mr_table.mtt_table) {
                mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
                err = -ENOMEM;
@@ -477,7 +482,8 @@ static int mthca_init_icm(struct mthca_dev *mdev,
        mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
                                                         dev_lim->mpt_entry_sz,
                                                         mdev->limits.num_mpts,
-                                                        mdev->limits.reserved_mrws, 1);
+                                                        mdev->limits.reserved_mrws,
+                                                        1, 1);
        if (!mdev->mr_table.mpt_table) {
                mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
                err = -ENOMEM;
@@ -487,7 +493,8 @@ static int mthca_init_icm(struct mthca_dev *mdev,
        mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
                                                        dev_lim->qpc_entry_sz,
                                                        mdev->limits.num_qps,
-                                                       mdev->limits.reserved_qps, 0);
+                                                       mdev->limits.reserved_qps,
+                                                       0, 0);
        if (!mdev->qp_table.qp_table) {
                mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
                err = -ENOMEM;
@@ -497,7 +504,8 @@ static int mthca_init_icm(struct mthca_dev *mdev,
        mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
                                                         dev_lim->eqpc_entry_sz,
                                                         mdev->limits.num_qps,
-                                                        mdev->limits.reserved_qps, 0);
+                                                        mdev->limits.reserved_qps,
+                                                        0, 0);
        if (!mdev->qp_table.eqp_table) {
                mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
                err = -ENOMEM;
@@ -507,7 +515,7 @@ static int mthca_init_icm(struct mthca_dev *mdev,
        mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
                                                         MTHCA_RDB_ENTRY_SIZE,
                                                         mdev->limits.num_qps <<
-                                                        mdev->qp_table.rdb_shift,
+                                                        mdev->qp_table.rdb_shift, 0,
                                                         0, 0);
        if (!mdev->qp_table.rdb_table) {
                mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
@@ -518,7 +526,8 @@ static int mthca_init_icm(struct mthca_dev *mdev,
        mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
                                                    dev_lim->cqc_entry_sz,
                                                    mdev->limits.num_cqs,
-                                                   mdev->limits.reserved_cqs, 0);
+                                                   mdev->limits.reserved_cqs,
+                                                   0, 0);
        if (!mdev->cq_table.table) {
                mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
                err = -ENOMEM;
@@ -530,7 +539,8 @@ static int mthca_init_icm(struct mthca_dev *mdev,
                        mthca_alloc_icm_table(mdev, init_hca->srqc_base,
                                              dev_lim->srq_entry_sz,
                                              mdev->limits.num_srqs,
-                                             mdev->limits.reserved_srqs, 0);
+                                             mdev->limits.reserved_srqs,
+                                             0, 0);
                if (!mdev->srq_table.table) {
                        mthca_err(mdev, "Failed to map SRQ context memory, "
                                  "aborting.\n");
@@ -550,7 +560,7 @@ static int mthca_init_icm(struct mthca_dev *mdev,
                                                      mdev->limits.num_amgms,
                                                      mdev->limits.num_mgms +
                                                      mdev->limits.num_amgms,
-                                                     0);
+                                                     0, 0);
        if (!mdev->mcg_table.table) {
                mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
                err = -ENOMEM;
@@ -588,7 +598,7 @@ err_unmap_aux:
        mthca_UNMAP_ICM_AUX(mdev, &status);
 
 err_free_aux:
-       mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+       mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
 
        return err;
 }
@@ -609,7 +619,7 @@ static void mthca_free_icms(struct mthca_dev *mdev)
        mthca_unmap_eq_icm(mdev);
 
        mthca_UNMAP_ICM_AUX(mdev, &status);
-       mthca_free_icm(mdev, mdev->fw.arbel.aux_icm);
+       mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
 }
 
 static int mthca_init_arbel(struct mthca_dev *mdev)
@@ -693,7 +703,7 @@ err_free_icm:
 
 err_stop_fw:
        mthca_UNMAP_FA(mdev, &status);
-       mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+       mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
 
 err_disable:
        if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
@@ -712,7 +722,7 @@ static void mthca_close_hca(struct mthca_dev *mdev)
                mthca_free_icms(mdev);
 
                mthca_UNMAP_FA(mdev, &status);
-               mthca_free_icm(mdev, mdev->fw.arbel.fw_icm);
+               mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
 
                if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
                        mthca_DISABLE_LAM(mdev, &status);
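
Editor's note: the new adjustment in mthca_init_icm() rounds the reserved MTT region up to a whole number of cache lines, so segments the CPU writes never share a cache line with reserved segments the HCA may DMA into. A user-space sketch of the same rounding, with a 128-byte value standing in for dma_get_cache_alignment() and the usual 64-byte segment assumed:

#include <stdio.h>

#define ALIGN(x, a)     (((x) + (a) - 1) / (a) * (a))
#define MTT_SEG_SIZE    64	/* assumed segment size in bytes */

int main(void)
{
	int reserved_mtts = 3;		/* segments reserved for the HCA */
	int cache_line    = 128;	/* dma_get_cache_alignment() stand-in */

	/* Round the reserved area up to a cache-line boundary, expressed
	 * back in segments, so CPU-written segments start on a fresh line. */
	reserved_mtts = ALIGN(reserved_mtts * MTT_SEG_SIZE, cache_line) / MTT_SEG_SIZE;

	printf("reserved_mtts = %d\n", reserved_mtts);	/* 3 -> 4 */
	return 0;
}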
index 6b19645d946c31af877c28d5c8d8c9ddfa4ec94f..0b9d053a599d2c3a328913aceccafb5721ed5698 100644 (file)
@@ -35,6 +35,9 @@
  */
 
 #include <linux/mm.h>
+#include <linux/scatterlist.h>
+
+#include <asm/page.h>
 
 #include "mthca_memfree.h"
 #include "mthca_dev.h"
@@ -58,22 +61,42 @@ struct mthca_user_db_table {
        }                page[0];
 };
 
-void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
+static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+       int i;
+
+       if (chunk->nsg > 0)
+               pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+                            PCI_DMA_BIDIRECTIONAL);
+
+       for (i = 0; i < chunk->npages; ++i)
+               __free_pages(chunk->mem[i].page,
+                            get_order(chunk->mem[i].length));
+}
+
+static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
 {
-       struct mthca_icm_chunk *chunk, *tmp;
        int i;
 
+       for (i = 0; i < chunk->npages; ++i) {
+               dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+                                 lowmem_page_address(chunk->mem[i].page),
+                                 sg_dma_address(&chunk->mem[i]));
+       }
+}
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
+{
+       struct mthca_icm_chunk *chunk, *tmp;
+
        if (!icm)
                return;
 
        list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
-               if (chunk->nsg > 0)
-                       pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
-                                    PCI_DMA_BIDIRECTIONAL);
-
-               for (i = 0; i < chunk->npages; ++i)
-                       __free_pages(chunk->mem[i].page,
-                                    get_order(chunk->mem[i].length));
+               if (coherent)
+                       mthca_free_icm_coherent(dev, chunk);
+               else
+                       mthca_free_icm_pages(dev, chunk);
 
                kfree(chunk);
        }
@@ -81,12 +104,41 @@ void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm)
        kfree(icm);
 }
 
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+       mem->page = alloc_pages(gfp_mask, order);
+       if (!mem->page)
+               return -ENOMEM;
+
+       mem->length = PAGE_SIZE << order;
+       mem->offset = 0;
+       return 0;
+}
+
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+                                   int order, gfp_t gfp_mask)
+{
+       void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem),
+                                      gfp_mask);
+       if (!buf)
+               return -ENOMEM;
+
+       sg_set_buf(mem, buf, PAGE_SIZE << order);
+       BUG_ON(mem->offset);
+       sg_dma_len(mem) = PAGE_SIZE << order;
+       return 0;
+}
+
 struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
-                                 gfp_t gfp_mask)
+                                 gfp_t gfp_mask, int coherent)
 {
        struct mthca_icm *icm;
        struct mthca_icm_chunk *chunk = NULL;
        int cur_order;
+       int ret;
+
+       /* We use sg_set_buf for coherent allocs, which assumes low memory */
+       BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
 
        icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
        if (!icm)
@@ -112,21 +164,28 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
                while (1 << cur_order > npages)
                        --cur_order;
 
-               chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
-               if (chunk->mem[chunk->npages].page) {
-                       chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
-                       chunk->mem[chunk->npages].offset = 0;
+               if (coherent)
+                       ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+                                                      &chunk->mem[chunk->npages],
+                                                      cur_order, gfp_mask);
+               else
+                       ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+                                                   cur_order, gfp_mask);
 
-                       if (++chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+               if (!ret) {
+                       ++chunk->npages;
+
+                       if (!coherent && chunk->npages == MTHCA_ICM_CHUNK_LEN) {
                                chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
                                                        chunk->npages,
                                                        PCI_DMA_BIDIRECTIONAL);
 
                                if (chunk->nsg <= 0)
                                        goto fail;
+                       }
 
+                       if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
                                chunk = NULL;
-                       }
 
                        npages -= 1 << cur_order;
                } else {
@@ -136,7 +195,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
                }
        }
 
-       if (chunk) {
+       if (!coherent && chunk) {
                chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
                                        chunk->npages,
                                        PCI_DMA_BIDIRECTIONAL);
@@ -148,7 +207,7 @@ struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
        return icm;
 
 fail:
-       mthca_free_icm(dev, icm);
+       mthca_free_icm(dev, icm, coherent);
        return NULL;
 }
 
@@ -167,7 +226,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob
 
        table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
                                        (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
-                                       __GFP_NOWARN);
+                                       __GFP_NOWARN, table->coherent);
        if (!table->icm[i]) {
                ret = -ENOMEM;
                goto out;
@@ -175,7 +234,7 @@ int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int ob
 
        if (mthca_MAP_ICM(dev, table->icm[i], table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
                          &status) || status) {
-               mthca_free_icm(dev, table->icm[i]);
+               mthca_free_icm(dev, table->icm[i], table->coherent);
                table->icm[i] = NULL;
                ret = -ENOMEM;
                goto out;
@@ -204,16 +263,16 @@ void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int o
                mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
                                MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
                                &status);
-               mthca_free_icm(dev, table->icm[i]);
+               mthca_free_icm(dev, table->icm[i], table->coherent);
                table->icm[i] = NULL;
        }
 
        mutex_unlock(&table->mutex);
 }
 
-void *mthca_table_find(struct mthca_icm_table *table, int obj)
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
 {
-       int idx, offset, i;
+       int idx, offset, dma_offset, i;
        struct mthca_icm_chunk *chunk;
        struct mthca_icm *icm;
        struct page *page = NULL;
@@ -225,13 +284,22 @@ void *mthca_table_find(struct mthca_icm_table *table, int obj)
 
        idx = (obj & (table->num_obj - 1)) * table->obj_size;
        icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
-       offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+       dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
 
        if (!icm)
                goto out;
 
        list_for_each_entry(chunk, &icm->chunk_list, list) {
                for (i = 0; i < chunk->npages; ++i) {
+                       if (dma_handle && dma_offset >= 0) {
+                               if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+                                       *dma_handle = sg_dma_address(&chunk->mem[i]) +
+                                               dma_offset;
+                               dma_offset -= sg_dma_len(&chunk->mem[i]);
+                       }
+                       /* DMA mapping can merge pages but not split them,
+                        * so if we found the page, dma_handle has already
+                        * been assigned to. */
                        if (chunk->mem[i].length > offset) {
                                page = chunk->mem[i].page;
                                goto out;
@@ -283,7 +351,7 @@ void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
                                              u64 virt, int obj_size,
                                              int nobj, int reserved,
-                                             int use_lowmem)
+                                             int use_lowmem, int use_coherent)
 {
        struct mthca_icm_table *table;
        int num_icm;
@@ -302,6 +370,7 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
        table->num_obj  = nobj;
        table->obj_size = obj_size;
        table->lowmem   = use_lowmem;
+       table->coherent = use_coherent;
        mutex_init(&table->mutex);
 
        for (i = 0; i < num_icm; ++i)
@@ -314,12 +383,12 @@ struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
 
                table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
                                                (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
-                                               __GFP_NOWARN);
+                                               __GFP_NOWARN, use_coherent);
                if (!table->icm[i])
                        goto err;
                if (mthca_MAP_ICM(dev, table->icm[i], virt + i * MTHCA_TABLE_CHUNK_SIZE,
                                  &status) || status) {
-                       mthca_free_icm(dev, table->icm[i]);
+                       mthca_free_icm(dev, table->icm[i], table->coherent);
                        table->icm[i] = NULL;
                        goto err;
                }
@@ -339,7 +408,7 @@ err:
                        mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
                                        MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
                                        &status);
-                       mthca_free_icm(dev, table->icm[i]);
+                       mthca_free_icm(dev, table->icm[i], table->coherent);
                }
 
        kfree(table);
@@ -357,7 +426,7 @@ void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
                        mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
                                        MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE,
                                        &status);
-                       mthca_free_icm(dev, table->icm[i]);
+                       mthca_free_icm(dev, table->icm[i], table->coherent);
                }
 
        kfree(table);
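
Editor's note: mthca_table_find() now also reports the DMA address of the object it locates, walking the chunk's scatter entries and decrementing the remaining offset until it reaches the piece that contains the object. The sketch below shows just that walk in user space; it is simplified in that it tracks a single offset, whereas the driver keeps the CPU page offset and the DMA offset separately because the DMA mapping may have merged pages.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Simplified stand-in for one scatterlist entry of an ICM chunk. */
struct seg {
	uintptr_t dma_addr;	/* bus address of this piece */
	size_t    len;		/* its length in bytes */
};

/* Walk the pieces, subtracting each length from the offset until the
 * piece holding the object is found; report its DMA address as well. */
static int table_find(const struct seg *segs, int nsegs, size_t offset,
		      uintptr_t *dma_handle)
{
	size_t dma_offset = offset;
	int i;

	for (i = 0; i < nsegs; ++i) {
		if (segs[i].len > dma_offset) {
			*dma_handle = segs[i].dma_addr + dma_offset;
			return i;	/* index of the piece holding the object */
		}
		dma_offset -= segs[i].len;
	}
	return -1;
}

int main(void)
{
	struct seg segs[] = {
		{ 0x10000, 4096 },
		{ 0x40000, 8192 },	/* mapping may merge pages: 8 KB piece */
	};
	uintptr_t dma;
	int idx = table_find(segs, 2, 6000, &dma);

	printf("piece %d, dma 0x%lx\n", idx, (unsigned long)dma);
	return 0;
}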
index 6d42947e1dc448c35633a24d3280a25d58cc16da..594144145f45d8ae36c6e5324e14cb1c118b5cf8 100644 (file)
@@ -69,6 +69,7 @@ struct mthca_icm_table {
        int               num_obj;
        int               obj_size;
        int               lowmem;
+       int               coherent;
        struct mutex      mutex;
        struct mthca_icm *icm[0];
 };
@@ -82,17 +83,17 @@ struct mthca_icm_iter {
 struct mthca_dev;
 
 struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
-                                 gfp_t gfp_mask);
-void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm);
+                                 gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
 
 struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
                                              u64 virt, int obj_size,
                                              int nobj, int reserved,
-                                             int use_lowmem);
+                                             int use_lowmem, int use_coherent);
 void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
 int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
 void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
-void *mthca_table_find(struct mthca_icm_table *table, int obj);
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
 int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
                          int start, int end);
 void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
index f71ffa88db3a1066906ffbf261c9273c80338b06..6037dd3f87dfa16e8a165a9c41adc17538e69b51 100644 (file)
@@ -243,8 +243,8 @@ void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt)
        kfree(mtt);
 }
 
-int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
-                   int start_index, u64 *buffer_list, int list_len)
+static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+                            int start_index, u64 *buffer_list, int list_len)
 {
        struct mthca_mailbox *mailbox;
        __be64 *mtt_entry;
@@ -295,6 +295,84 @@ out:
        return err;
 }
 
+int mthca_write_mtt_size(struct mthca_dev *dev)
+{
+       if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy)
+               /*
+                * Be friendly to WRITE_MTT command
+                * and leave two empty slots for the
+                * index and reserved fields of the
+                * mailbox.
+                */
+               return PAGE_SIZE / sizeof (u64) - 2;
+
+       /* For Arbel, all MTTs must fit in the same page. */
+       return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff;
+}
+
+void mthca_tavor_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt,
+                             int start_index, u64 *buffer_list, int list_len)
+{
+       u64 __iomem *mtts;
+       int i;
+
+       mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * MTHCA_MTT_SEG_SIZE +
+               start_index * sizeof (u64);
+       for (i = 0; i < list_len; ++i)
+               mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT),
+                                 mtts + i);
+}
+
+void mthca_arbel_write_mtt_seg(struct mthca_dev *dev, struct mthca_mtt *mtt,
+                             int start_index, u64 *buffer_list, int list_len)
+{
+       __be64 *mtts;
+       dma_addr_t dma_handle;
+       int i;
+       int s = start_index * sizeof (u64);
+
+       /* For Arbel, all MTTs must fit in the same page. */
+       BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE);
+       /* Require full segments */
+       BUG_ON(s % MTHCA_MTT_SEG_SIZE);
+
+       mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
+                               s / MTHCA_MTT_SEG_SIZE, &dma_handle);
+
+       BUG_ON(!mtts);
+
+       for (i = 0; i < list_len; ++i)
+               mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT);
+
+       dma_sync_single(&dev->pdev->dev, dma_handle, list_len * sizeof (u64), DMA_TO_DEVICE);
+}
+
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+                   int start_index, u64 *buffer_list, int list_len)
+{
+       int size = mthca_write_mtt_size(dev);
+       int chunk;
+
+       if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy)
+               return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len);
+
+       while (list_len > 0) {
+               chunk = min(size, list_len);
+               if (mthca_is_memfree(dev))
+                       mthca_arbel_write_mtt_seg(dev, mtt, start_index,
+                                                 buffer_list, chunk);
+               else
+                       mthca_tavor_write_mtt_seg(dev, mtt, start_index,
+                                                 buffer_list, chunk);
+
+               list_len    -= chunk;
+               start_index += chunk;
+               buffer_list += chunk;
+       }
+
+       return 0;
+}
+
 static inline u32 tavor_hw_index_to_key(u32 ind)
 {
        return ind;
@@ -524,7 +602,7 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
                if (err)
                        goto err_out_mpt_free;
 
-               mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key);
+               mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
                BUG_ON(!mr->mem.arbel.mpt);
        } else
                mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
@@ -538,7 +616,8 @@ int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 
        if (mthca_is_memfree(dev)) {
                mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
-                                                     mr->mtt->first_seg);
+                                                     mr->mtt->first_seg,
+                                                     &mr->mem.arbel.dma_handle);
                BUG_ON(!mr->mem.arbel.mtts);
        } else
                mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
@@ -712,6 +791,9 @@ int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
                fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
                                                     MTHCA_MTT_FLAG_PRESENT);
 
+       dma_sync_single(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+                       list_len * sizeof(u64), DMA_TO_DEVICE);
+
        fmr->mem.arbel.mpt->key    = cpu_to_be32(key);
        fmr->mem.arbel.mpt->lkey   = cpu_to_be32(key);
        fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
@@ -761,7 +843,7 @@ void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
 int mthca_init_mr_table(struct mthca_dev *dev)
 {
        unsigned long addr;
-       int err, i;
+       int mpts, mtts, err, i;
 
        err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
                               dev->limits.num_mpts,
@@ -795,13 +877,21 @@ int mthca_init_mr_table(struct mthca_dev *dev)
                        err = -EINVAL;
                        goto err_fmr_mpt;
                }
+               mpts = mtts = 1 << i;
+       } else {
+               mpts = dev->limits.num_mpts;
+               mtts = dev->limits.num_mtt_segs;
+       }
+
+       if (!mthca_is_memfree(dev) &&
+           (dev->mthca_flags & MTHCA_FLAG_FMR)) {
 
                addr = pci_resource_start(dev->pdev, 4) +
                        ((pci_resource_len(dev->pdev, 4) - 1) &
                         dev->mr_table.mpt_base);
 
                dev->mr_table.tavor_fmr.mpt_base =
-                       ioremap(addr, (1 << i) * sizeof(struct mthca_mpt_entry));
+                       ioremap(addr, mpts * sizeof(struct mthca_mpt_entry));
 
                if (!dev->mr_table.tavor_fmr.mpt_base) {
                        mthca_warn(dev, "MPT ioremap for FMR failed.\n");
@@ -814,19 +904,21 @@ int mthca_init_mr_table(struct mthca_dev *dev)
                         dev->mr_table.mtt_base);
 
                dev->mr_table.tavor_fmr.mtt_base =
-                       ioremap(addr, (1 << i) * MTHCA_MTT_SEG_SIZE);
+                       ioremap(addr, mtts * MTHCA_MTT_SEG_SIZE);
                if (!dev->mr_table.tavor_fmr.mtt_base) {
                        mthca_warn(dev, "MTT ioremap for FMR failed.\n");
                        err = -ENOMEM;
                        goto err_fmr_mtt;
                }
+       }
 
-               err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, i);
+       if (dev->limits.fmr_reserved_mtts) {
+               err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1));
                if (err)
                        goto err_fmr_mtt_buddy;
 
                /* Prevent regular MRs from using FMR keys */
-               err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, i);
+               err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1));
                if (err)
                        goto err_reserve_fmr;
 
index 58d44aa3c30297a6920c8b292a27cb458b1a7763..26bf86d1cfcd693cf29cdc19bd9704ec916b65de 100644 (file)
@@ -277,7 +277,7 @@ u64 mthca_make_profile(struct mthca_dev *dev,
         * out of the MR pool. They don't use additional memory, but
         * we assign them as part of the HCA profile anyway.
         */
-       if (mthca_is_memfree(dev))
+       if (mthca_is_memfree(dev) || BITS_PER_LONG == 64)
                dev->limits.fmr_reserved_mtts = 0;
        else
                dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
index 7b96751695eac20deb60c8c60e13eb2032a0763f..0725ad7ad9bf7ca43b31f96609311bb2defbfea9 100644 (file)
@@ -1015,6 +1015,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
        int shift, n, len;
        int i, j, k;
        int err = 0;
+       int write_mtt_size;
 
        shift = ffs(region->page_size) - 1;
 
@@ -1040,6 +1041,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
 
        i = n = 0;
 
+       write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
+
        list_for_each_entry(chunk, &region->chunk_list, list)
                for (j = 0; j < chunk->nmap; ++j) {
                        len = sg_dma_len(&chunk->page_list[j]) >> shift;
@@ -1047,14 +1050,11 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
                                pages[i++] = sg_dma_address(&chunk->page_list[j]) +
                                        region->page_size * k;
                                /*
-                                * Be friendly to WRITE_MTT command
-                                * and leave two empty slots for the
-                                * index and reserved fields of the
-                                * mailbox.
+                                * Be friendly to write_mtt and pass it chunks
+                                * of appropriate size.
                                 */
-                               if (i == PAGE_SIZE / sizeof (u64) - 2) {
-                                       err = mthca_write_mtt(dev, mr->mtt,
-                                                             n, pages, i);
+                               if (i == write_mtt_size) {
+                                       err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
                                        if (err)
                                                goto mtt_done;
                                        n += i;
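
Editor's note: in mthca_reg_user_mr() the batch size is additionally capped at PAGE_SIZE / sizeof *pages, since the page addresses are staged in a single page-sized buffer before each mthca_write_mtt() call. A tiny sketch of that capping, assuming 4 KB pages; the mailbox limit shown is illustrative:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096	/* assumed for illustration */

int main(void)
{
	/* One temporary page holds PAGE_SIZE / sizeof(u64) = 512 addresses. */
	int pages_buf_cap  = PAGE_SIZE / sizeof(uint64_t);
	int write_mtt_size = 510;	/* e.g. mailbox limit from mthca_write_mtt_size() */
	int batch = pages_buf_cap < write_mtt_size ? pages_buf_cap : write_mtt_size;

	/* Flush every 'batch' pages: neither the buffer nor write_mtt overflows. */
	printf("flush every %d pages\n", batch);	/* 510 */
	return 0;
}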
index 9a5bece3fa5c28eaf9ffb1fda134cb8785d39cac..1d266ac2e094c404d2cd8d3167261c07e20c0f6d 100644 (file)
@@ -89,6 +89,7 @@ struct mthca_fmr {
                struct {
                        struct mthca_mpt_entry *mpt;
                        __be64 *mtts;
+                       dma_addr_t dma_handle;
                } arbel;
        } mem;
 };
index 5f5214c0337d18383ad6f2272b99fbd47f68a69b..224c93dd29eb7ecd9c56e8df0044aaa5c6609277 100644 (file)
@@ -399,7 +399,7 @@ static int to_ib_qp_access_flags(int mthca_flags)
 static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
                                struct mthca_qp_path *path)
 {
-       memset(ib_ah_attr, 0, sizeof *path);
+       memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
        ib_ah_attr->port_num      = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
 
        if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
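
Editor's note: the one-line fix above sizes the memset() by the structure actually being cleared (*ib_ah_attr) instead of the smaller *path, so no field of the returned attributes is left stale. A trivial user-space illustration of why the memset size must follow the destination object:

#include <stdio.h>
#include <string.h>

struct dst { int a, b, c, d; };
struct src { int x; };

int main(void)
{
	struct dst d;
	struct src s = { 0 };

	/* Always size memset() by the object being cleared, not a nearby one:
	 * sizeof s here would leave most of 'd' uninitialized. */
	memset(&d, 0, sizeof d);

	(void)s;
	printf("%zu vs %zu bytes\n", sizeof d, sizeof s);
	return 0;
}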
index 10684da33d5853e18e0cea25d9b219fbcd2b3b54..61974b0296ca132e26feacb70d10e556f2d706b4 100644 (file)
@@ -116,11 +116,16 @@ static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
                                         struct mthca_srq *srq,
                                         struct mthca_arbel_srq_context *context)
 {
-       int logsize;
+       int logsize, max;
 
        memset(context, 0, sizeof *context);
 
-       logsize = ilog2(srq->max);
+       /*
+        * Put max in a temporary variable to work around gcc bug
+        * triggered by ilog2() on sparc64.
+        */
+       max = srq->max;
+       logsize = ilog2(max);
        context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
        context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
        context->db_index = cpu_to_be32(srq->db_index);
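
Editor's note: the workaround copies srq->max into a local variable before calling ilog2(), because, per the changelog, gcc on sparc64 miscompiled ilog2() when the structure member was passed directly. A user-space sketch of the same pattern, with a plain loop standing in for the kernel's ilog2():

#include <stdio.h>

/* Plain stand-in for the kernel's ilog2() on a power-of-two size. */
static int ilog2_helper(unsigned int v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

struct srq { unsigned int max; };

int main(void)
{
	struct srq srq = { .max = 256 };

	/* Work around the reported compiler bug by going through a local
	 * temporary instead of passing the struct member directly. */
	unsigned int max = srq.max;
	int logsize = ilog2_helper(max);

	printf("logsize = %d\n", logsize);	/* 8 */
	return 0;
}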
index c75322d820d42815cacd872e5574f1432abe695c..af78ccc4ce7108aba6968b5438a0d8adbc7be4f2 100644 (file)
@@ -1,6 +1,6 @@
 config INFINIBAND_IPOIB
        tristate "IP-over-InfiniBand"
-       depends on INFINIBAND && NETDEVICES && INET
+       depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
        ---help---
          Support for the IP-over-InfiniBand protocol (IPoIB). This
          transports IP packets over InfiniBand so you can use your IB
@@ -8,6 +8,20 @@ config INFINIBAND_IPOIB
 
          See Documentation/infiniband/ipoib.txt for more information
 
+config INFINIBAND_IPOIB_CM
+       bool "IP-over-InfiniBand Connected Mode support"
+       depends on INFINIBAND_IPOIB && EXPERIMENTAL
+       default n
+       ---help---
+         This option enables experimental support for IPoIB connected mode.
+         After enabling this option, you need to switch the interface to
+         connected mode through /sys/class/net/ibXXX/mode to actually create
+         connections, and then increase the interface MTU with
+         e.g. ifconfig ib0 mtu 65520.
+
+         WARNING: Enabling connected mode will trigger some packet drops for
+         multicast and UD mode traffic from this interface, unless you limit
+         the MTU for these destinations to 2044.
+
 config INFINIBAND_IPOIB_DEBUG
        bool "IP-over-InfiniBand debugging" if EMBEDDED
        depends on INFINIBAND_IPOIB
index 8935e74ae3f8e9608ced0be86d2e3818db268dd8..98ee38e8c2c4bcc465149ecd3d550bd879ceac2a 100644 (file)
@@ -5,5 +5,6 @@ ib_ipoib-y                                      := ipoib_main.o \
                                                   ipoib_multicast.o \
                                                   ipoib_verbs.o \
                                                   ipoib_vlan.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)         += ipoib_cm.o
 ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)      += ipoib_fs.o
 
index 07deee8f81ce203e7be98161af36d6cb3a73a6b8..2594db2030b3abef14ac28b9373c3b41999c7c54 100644 (file)
@@ -62,6 +62,10 @@ enum {
 
        IPOIB_ENCAP_LEN           = 4,
 
+       IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
+       IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
+       IPOIB_CM_HEAD_SIZE        = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
+       IPOIB_CM_RX_SG            = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
        IPOIB_RX_RING_SIZE        = 128,
        IPOIB_TX_RING_SIZE        = 64,
        IPOIB_MAX_QUEUE_SIZE      = 8192,
@@ -81,6 +85,8 @@ enum {
        IPOIB_MCAST_RUN           = 6,
        IPOIB_STOP_REAPER         = 7,
        IPOIB_MCAST_STARTED       = 8,
+       IPOIB_FLAG_NETIF_STOPPED  = 9,
+       IPOIB_FLAG_ADMIN_CM       = 10,
 
        IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -90,6 +96,13 @@ enum {
        IPOIB_MCAST_FLAG_ATTACHED = 3,
 };
 
+#define        IPOIB_OP_RECV   (1ul << 31)
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+#define        IPOIB_CM_OP_SRQ (1ul << 30)
+#else
+#define        IPOIB_CM_OP_SRQ (0)
+#endif
+
 /* structs */
 
 struct ipoib_header {
@@ -113,6 +126,59 @@ struct ipoib_tx_buf {
        u64             mapping;
 };
 
+struct ib_cm_id;
+
+struct ipoib_cm_data {
+       __be32 qpn; /* High byte MUST be ignored on receive */
+       __be32 mtu;
+};
+
+struct ipoib_cm_rx {
+       struct ib_cm_id     *id;
+       struct ib_qp        *qp;
+       struct list_head     list;
+       struct net_device   *dev;
+       unsigned long        jiffies;
+};
+
+struct ipoib_cm_tx {
+       struct ib_cm_id     *id;
+       struct ib_cq        *cq;
+       struct ib_qp        *qp;
+       struct list_head     list;
+       struct net_device   *dev;
+       struct ipoib_neigh  *neigh;
+       struct ipoib_path   *path;
+       struct ipoib_tx_buf *tx_ring;
+       unsigned             tx_head;
+       unsigned             tx_tail;
+       unsigned long        flags;
+       u32                  mtu;
+       struct ib_wc         ibwc[IPOIB_NUM_WC];
+};
+
+struct ipoib_cm_rx_buf {
+       struct sk_buff *skb;
+       u64 mapping[IPOIB_CM_RX_SG];
+};
+
+struct ipoib_cm_dev_priv {
+       struct ib_srq          *srq;
+       struct ipoib_cm_rx_buf *srq_ring;
+       struct ib_cm_id        *id;
+       struct list_head        passive_ids;
+       struct work_struct      start_task;
+       struct work_struct      reap_task;
+       struct work_struct      skb_task;
+       struct delayed_work     stale_task;
+       struct sk_buff_head     skb_queue;
+       struct list_head        start_list;
+       struct list_head        reap_list;
+       struct ib_wc            ibwc[IPOIB_NUM_WC];
+       struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
+       struct ib_recv_wr       rx_wr;
+};
+
 /*
  * Device private locking: tx_lock protects members used in TX fast
  * path (and we use LLTX so upper layers don't do extra locking).
@@ -179,6 +245,10 @@ struct ipoib_dev_priv {
        struct list_head child_intfs;
        struct list_head list;
 
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+       struct ipoib_cm_dev_priv cm;
+#endif
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
        struct list_head fs_list;
        struct dentry *mcg_dentry;
@@ -212,6 +282,9 @@ struct ipoib_path {
 
 struct ipoib_neigh {
        struct ipoib_ah    *ah;
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+       struct ipoib_cm_tx *cm;
+#endif
        union ib_gid        dgid;
        struct sk_buff_head queue;
 
@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
 void ipoib_pkey_poll(struct work_struct *work);
 int ipoib_pkey_dev_delay_open(struct net_device *dev);
 
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+#define IPOIB_FLAGS_RC          0x80
+#define IPOIB_FLAGS_UC          0x40
+
+/* We don't support UC connections at the moment */
+#define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
+               test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       return IPOIB_CM_SUPPORTED(n->ha) &&
+               test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+{
+       return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+       return neigh->cm;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+       neigh->cm = tx;
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct net_device *dev);
+void ipoib_cm_dev_stop(struct net_device *dev);
+int ipoib_cm_dev_init(struct net_device *dev);
+int ipoib_cm_add_mode_attr(struct net_device *dev);
+void ipoib_cm_dev_cleanup(struct net_device *dev);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+                                   struct ipoib_neigh *neigh);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+                          unsigned int mtu);
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+#else
+
+struct ipoib_cm_tx;
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+       return 0;
+}
+static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
+{
+       return 0;
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+{
+       return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+       return NULL;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+}
+
+static inline
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+       return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+       return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+       return;
+}
+
+static inline
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+       return -ENOSYS;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+       return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+                                   struct ipoib_neigh *neigh)
+{
+       return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+       return;
+}
+
+static inline
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+       return 0;
+}
+
+static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+                                        unsigned int mtu)
+{
+       dev_kfree_skb_any(skb);
+}
+
+static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
+
+#endif
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 void ipoib_create_debug_files(struct net_device *dev);
 void ipoib_delete_debug_files(struct net_device *dev);
@@ -392,4 +605,6 @@ extern int ipoib_debug_level;
 
 #define IPOIB_GID_ARG(gid)     IPOIB_GID_RAW_ARG((gid).raw)
 
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
+
 #endif /* _IPOIB_H */
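
Editor's note: the connected-mode receive buffer is sized for an almost-64 KB MTU plus the 4-byte IPoIB encapsulation and is split into a linear head (the remainder modulo PAGE_SIZE) plus full-page fragments, which is what ipoib_cm_alloc_rx_skb() and skb_put_frags() in the new ipoib_cm.c rely on. A quick user-space check of that arithmetic, assuming 4 KB pages:

#include <stdio.h>

#define PAGE_SIZE            4096	/* assumed for illustration */
#define ALIGN(x, a)          (((x) + (a) - 1) / (a) * (a))

#define IPOIB_ENCAP_LEN      4
#define IPOIB_CM_MTU         (0x10000 - 0x10)
#define IPOIB_CM_BUF_SIZE    (IPOIB_CM_MTU + IPOIB_ENCAP_LEN)
#define IPOIB_CM_HEAD_SIZE   (IPOIB_CM_BUF_SIZE % PAGE_SIZE)
#define IPOIB_CM_RX_SG       (ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE)

int main(void)
{
	/* 65524-byte buffer = 4084-byte linear head + 15 full 4 KB pages. */
	printf("buf %d = head %d + %d pages of %d\n",
	       IPOIB_CM_BUF_SIZE, IPOIB_CM_HEAD_SIZE,
	       IPOIB_CM_RX_SG - 1, PAGE_SIZE);
	return 0;
}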
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644 (file)
index 0000000..2d48387
--- /dev/null
@@ -0,0 +1,1237 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * $Id$
+ */
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_cache.h>
+#include <net/dst.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+                "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+#include "ipoib.h"
+
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
+#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
+#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
+#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
+struct ipoib_cm_id {
+       struct ib_cm_id *id;
+       int flags;
+       u32 remote_qpn;
+       u32 remote_mtu;
+};
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+                              struct ib_cm_event *event);
+
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
+                                 u64 mapping[IPOIB_CM_RX_SG])
+{
+       int i;
+
+       ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+       for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i)
+               ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+}
+
+static int ipoib_cm_post_receive(struct net_device *dev, int id)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_recv_wr *bad_wr;
+       int i, ret;
+
+       priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+
+       for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+               priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+       ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+       if (unlikely(ret)) {
+               ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+               ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping);
+               dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+               priv->cm.srq_ring[id].skb = NULL;
+       }
+
+       return ret;
+}
+
+static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
+                                u64 mapping[IPOIB_CM_RX_SG])
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct sk_buff *skb;
+       int i;
+
+       skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+       if (unlikely(!skb))
+               return -ENOMEM;
+
+       /*
+        * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+        * IP header to a multiple of 16.
+        */
+       skb_reserve(skb, 12);
+
+       mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
+                                      DMA_FROM_DEVICE);
+       if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
+               dev_kfree_skb_any(skb);
+               return -EIO;
+       }
+
+       for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) {
+               struct page *page = alloc_page(GFP_ATOMIC);
+
+               if (!page)
+                       goto partial_error;
+               skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
+
+               mapping[i + 1] = ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[i].page,
+                                                0, PAGE_SIZE, DMA_FROM_DEVICE);
+               if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
+                       goto partial_error;
+       }
+
+       priv->cm.srq_ring[id].skb = skb;
+       return 0;
+
+partial_error:
+
+       ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+       for (; i >= 0; --i)
+               ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+
+       kfree_skb(skb);
+       return -ENOMEM;
+}
+
+static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
+                                          struct ipoib_cm_rx *p)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_qp_init_attr attr = {
+               .send_cq = priv->cq, /* does not matter, we never send anything */
+               .recv_cq = priv->cq,
+               .srq = priv->cm.srq,
+               .cap.max_send_wr = 1, /* FIXME: 0 seems not to work */
+               .cap.max_send_sge = 1, /* FIXME: 0 seems not to work */
+               .sq_sig_type = IB_SIGNAL_ALL_WR,
+               .qp_type = IB_QPT_RC,
+               .qp_context = p,
+       };
+       return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_modify_rx_qp(struct net_device *dev,
+                                 struct ib_cm_id *cm_id, struct ib_qp *qp,
+                                 unsigned psn)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_qp_attr qp_attr;
+       int qp_attr_mask, ret;
+
+       qp_attr.qp_state = IB_QPS_INIT;
+       ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+               return ret;
+       }
+       ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+               return ret;
+       }
+       qp_attr.qp_state = IB_QPS_RTR;
+       ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+               return ret;
+       }
+       qp_attr.rq_psn = psn;
+       ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+               return ret;
+       }
+       return 0;
+}
+
+static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
+                            struct ib_qp *qp, struct ib_cm_req_event_param *req,
+                            unsigned psn)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_cm_data data = {};
+       struct ib_cm_rep_param rep = {};
+
+       data.qpn = cpu_to_be32(priv->qp->qp_num);
+       data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+       rep.private_data = &data;
+       rep.private_data_len = sizeof data;
+       rep.flow_control = 0;
+       rep.rnr_retry_count = req->rnr_retry_count;
+       rep.target_ack_delay = 20; /* FIXME */
+       rep.srq = 1;
+       rep.qp_num = qp->qp_num;
+       rep.starting_psn = psn;
+       return ib_send_cm_rep(cm_id, &rep);
+}
+
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+       struct net_device *dev = cm_id->context;
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_cm_rx *p;
+       unsigned long flags;
+       unsigned psn;
+       int ret;
+
+       ipoib_dbg(priv, "REQ arrived\n");
+       p = kzalloc(sizeof *p, GFP_KERNEL);
+       if (!p)
+               return -ENOMEM;
+       p->dev = dev;
+       p->id = cm_id;
+       p->qp = ipoib_cm_create_rx_qp(dev, p);
+       if (IS_ERR(p->qp)) {
+               ret = PTR_ERR(p->qp);
+               goto err_qp;
+       }
+
+       psn = random32() & 0xffffff;
+       ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
+       if (ret)
+               goto err_modify;
+
+       ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
+       if (ret) {
+               ipoib_warn(priv, "failed to send REP: %d\n", ret);
+               goto err_rep;
+       }
+
+       cm_id->context = p;
+       p->jiffies = jiffies;
+       spin_lock_irqsave(&priv->lock, flags);
+       list_add(&p->list, &priv->cm.passive_ids);
+       spin_unlock_irqrestore(&priv->lock, flags);
+       queue_delayed_work(ipoib_workqueue,
+                          &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+       return 0;
+
+err_rep:
+err_modify:
+       ib_destroy_qp(p->qp);
+err_qp:
+       kfree(p);
+       return ret;
+}
+
+static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
+                              struct ib_cm_event *event)
+{
+       struct ipoib_cm_rx *p;
+       struct ipoib_dev_priv *priv;
+       unsigned long flags;
+       int ret;
+
+       switch (event->event) {
+       case IB_CM_REQ_RECEIVED:
+               return ipoib_cm_req_handler(cm_id, event);
+       case IB_CM_DREQ_RECEIVED:
+               p = cm_id->context;
+               ib_send_cm_drep(cm_id, NULL, 0);
+               /* Fall through */
+       case IB_CM_REJ_RECEIVED:
+               p = cm_id->context;
+               priv = netdev_priv(p->dev);
+               spin_lock_irqsave(&priv->lock, flags);
+               if (list_empty(&p->list))
+                       ret = 0; /* Connection is going away already. */
+               else {
+                       list_del_init(&p->list);
+                       ret = -ECONNRESET;
+               }
+               spin_unlock_irqrestore(&priv->lock, flags);
+               if (ret) {
+                       ib_destroy_qp(p->qp);
+                       kfree(p);
+                       return ret;
+               }
+               return 0;
+       default:
+               return 0;
+       }
+}
+/* Adjust length of skb with fragments to match received data */
+static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
+                         unsigned int length)
+{
+       int i, num_frags;
+       unsigned int size;
+
+       /* put header into skb */
+       size = min(length, hdr_space);
+       skb->tail += size;
+       skb->len += size;
+       length -= size;
+
+       num_frags = skb_shinfo(skb)->nr_frags;
+       for (i = 0; i < num_frags; i++) {
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+               if (length == 0) {
+                       /* don't need this page */
+                       __free_page(frag->page);
+                       --skb_shinfo(skb)->nr_frags;
+               } else {
+                       size = min(length, (unsigned) PAGE_SIZE);
+
+                       frag->size = size;
+                       skb->data_len += size;
+                       skb->truesize += size;
+                       skb->len += size;
+                       length -= size;
+               }
+       }
+}
+
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+       struct sk_buff *skb;
+       struct ipoib_cm_rx *p;
+       unsigned long flags;
+       u64 mapping[IPOIB_CM_RX_SG];
+
+       ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
+                      wr_id, wc->opcode, wc->status);
+
+       if (unlikely(wr_id >= ipoib_recvq_size)) {
+               ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+                          wr_id, ipoib_recvq_size);
+               return;
+       }
+
+       skb  = priv->cm.srq_ring[wr_id].skb;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               ipoib_dbg(priv, "cm recv error "
+                          "(status=%d, wrid=%d vend_err %x)\n",
+                          wc->status, wr_id, wc->vendor_err);
+               ++priv->stats.rx_dropped;
+               goto repost;
+       }
+
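+       /* Refresh the connection's LRU timestamp only for wr_ids with the
+        * update-mask bits clear, so the lock is not taken for every packet. */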
+       if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) {
+               p = wc->qp->qp_context;
+               if (time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
+                       spin_lock_irqsave(&priv->lock, flags);
+                       p->jiffies = jiffies;
+                       /* Move this entry to list head, but do
+                        * not re-add it if it has been removed. */
+                       if (!list_empty(&p->list))
+                               list_move(&p->list, &priv->cm.passive_ids);
+                       spin_unlock_irqrestore(&priv->lock, flags);
+                       queue_delayed_work(ipoib_workqueue,
+                                          &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+               }
+       }
+
+       if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) {
+               /*
+                * If we can't allocate a new RX buffer, dump
+                * this packet and reuse the old buffer.
+                */
+               ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
+               ++priv->stats.rx_dropped;
+               goto repost;
+       }
+
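+       /* The old skb is passed up the stack; install the new buffer's mapping in its slot */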
+       ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping);
+       memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping);
+
+       ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+                      wc->byte_len, wc->slid);
+
+       skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len);
+
+       skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+       skb->mac.raw = skb->data;
+       skb_pull(skb, IPOIB_ENCAP_LEN);
+
+       dev->last_rx = jiffies;
+       ++priv->stats.rx_packets;
+       priv->stats.rx_bytes += skb->len;
+
+       skb->dev = dev;
+       /* XXX get correct PACKET_ type here */
+       skb->pkt_type = PACKET_HOST;
+       netif_rx_ni(skb);
+
+repost:
+       if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
+               ipoib_warn(priv, "ipoib_cm_post_receive failed "
+                          "for buf %d\n", wr_id);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+                           struct ipoib_cm_tx *tx,
+                           unsigned int wr_id,
+                           u64 addr, int len)
+{
+       struct ib_send_wr *bad_wr;
+
+       priv->tx_sge.addr             = addr;
+       priv->tx_sge.length           = len;
+
+       priv->tx_wr.wr_id             = wr_id;
+
+       return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_tx_buf *tx_req;
+       u64 addr;
+
+       if (unlikely(skb->len > tx->mtu)) {
+               ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+                          skb->len, tx->mtu);
+               ++priv->stats.tx_dropped;
+               ++priv->stats.tx_errors;
+               ipoib_cm_skb_too_long(dev, skb, tx->mtu - INFINIBAND_ALEN);
+               return;
+       }
+
+       ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
+                      tx->tx_head, skb->len, tx->qp->qp_num);
+
+       /*
+        * We put the skb into the tx_ring _before_ we call post_send()
+        * because it's entirely possible that the completion handler will
+        * run before we execute anything after the post_send().  That
+        * means we have to make sure everything is properly recorded and
+        * our state is consistent before we call post_send().
+        */
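+       /* The ring index masks tx_head with (ipoib_sendq_size - 1), so the
+        * send queue size must be a power of two. */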
+       tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+       tx_req->skb = skb;
+       addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
+       if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+               ++priv->stats.tx_errors;
+               dev_kfree_skb_any(skb);
+               return;
+       }
+
+       tx_req->mapping = addr;
+
+       if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+                               addr, skb->len))) {
+               ipoib_warn(priv, "post_send failed\n");
+               ++priv->stats.tx_errors;
+               ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+               dev_kfree_skb_any(skb);
+       } else {
+               dev->trans_start = jiffies;
+               ++tx->tx_head;
+
+               if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+                       ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
+                                 tx->qp->qp_num);
+                       netif_stop_queue(dev);
+                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+               }
+       }
+}
+
+static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
+                                 struct ib_wc *wc)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       unsigned int wr_id = wc->wr_id;
+       struct ipoib_tx_buf *tx_req;
+       unsigned long flags;
+
+       ipoib_dbg_data(priv, "cm send completion: id %d, op %d, status: %d\n",
+                      wr_id, wc->opcode, wc->status);
+
+       if (unlikely(wr_id >= ipoib_sendq_size)) {
+               ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+                          wr_id, ipoib_sendq_size);
+               return;
+       }
+
+       tx_req = &tx->tx_ring[wr_id];
+
+       ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+
+       /* FIXME: is this right? Shouldn't we only increment on success? */
+       ++priv->stats.tx_packets;
+       priv->stats.tx_bytes += tx_req->skb->len;
+
+       dev_kfree_skb_any(tx_req->skb);
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       ++tx->tx_tail;
+       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
+           tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
+               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+               netif_wake_queue(dev);
+       }
+
+       if (wc->status != IB_WC_SUCCESS &&
+           wc->status != IB_WC_WR_FLUSH_ERR) {
+               struct ipoib_neigh *neigh;
+
+               ipoib_dbg(priv, "failed cm send event "
+                          "(status=%d, wrid=%d vend_err %x)\n",
+                          wc->status, wr_id, wc->vendor_err);
+
+               spin_lock(&priv->lock);
+               neigh = tx->neigh;
+
+               if (neigh) {
+                       neigh->cm = NULL;
+                       list_del(&neigh->list);
+                       if (neigh->ah)
+                               ipoib_put_ah(neigh->ah);
+                       ipoib_neigh_free(dev, neigh);
+
+                       tx->neigh = NULL;
+               }
+
+               /* queue would be re-started anyway when TX is destroyed,
+                * but it makes sense to do it ASAP here. */
+               if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
+                       netif_wake_queue(dev);
+
+               if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+                       list_move(&tx->list, &priv->cm.reap_list);
+                       queue_work(ipoib_workqueue, &priv->cm.reap_task);
+               }
+
+               clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+
+               spin_unlock(&priv->lock);
+       }
+
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
+{
+       struct ipoib_cm_tx *tx = tx_ptr;
+       int n, i;
+
+       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       do {
+               n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
+               for (i = 0; i < n; ++i)
+                       ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
+       } while (n == IPOIB_NUM_WC);
+}
+
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       int ret;
+
+       if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
+               return 0;
+
+       priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
+       if (IS_ERR(priv->cm.id)) {
+               printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+               return PTR_ERR(priv->cm.id);
+       }
+
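+       /* Listen on the IPoIB CM service ID with our UD QPN in the low bits */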
+       ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+                          0, NULL);
+       if (ret) {
+               printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+                      IPOIB_CM_IETF_ID | priv->qp->qp_num);
+               ib_destroy_cm_id(priv->cm.id);
+               return ret;
+       }
+       return 0;
+}
+
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_cm_rx *p;
+       unsigned long flags;
+
+       if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
+               return;
+
+       ib_destroy_cm_id(priv->cm.id);
+       spin_lock_irqsave(&priv->lock, flags);
+       while (!list_empty(&priv->cm.passive_ids)) {
+               p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+               list_del_init(&p->list);
+               spin_unlock_irqrestore(&priv->lock, flags);
+               ib_destroy_cm_id(p->id);
+               ib_destroy_qp(p->qp);
+               kfree(p);
+               spin_lock_irqsave(&priv->lock, flags);
+       }
+       spin_unlock_irqrestore(&priv->lock, flags);
+
+       cancel_delayed_work(&priv->cm.stale_task);
+}
+
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+       struct ipoib_cm_tx *p = cm_id->context;
+       struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+       struct ipoib_cm_data *data = event->private_data;
+       struct sk_buff_head skqueue;
+       struct ib_qp_attr qp_attr;
+       int qp_attr_mask, ret;
+       struct sk_buff *skb;
+       unsigned long flags;
+
+       p->mtu = be32_to_cpu(data->mtu);
+
+       if (p->mtu < priv->dev->mtu + IPOIB_ENCAP_LEN) {
+               ipoib_warn(priv, "Rejecting connection: mtu %d < device mtu %d + 4\n",
+                          p->mtu, priv->dev->mtu);
+               return -EINVAL;
+       }
+
+       qp_attr.qp_state = IB_QPS_RTR;
+       ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+               return ret;
+       }
+
+       qp_attr.rq_psn = 0 /* FIXME */;
+       ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+               return ret;
+       }
+
+       qp_attr.qp_state = IB_QPS_RTS;
+       ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+               return ret;
+       }
+       ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+               return ret;
+       }
+
+       skb_queue_head_init(&skqueue);
+
+       spin_lock_irqsave(&priv->lock, flags);
+       set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+       if (p->neigh)
+               while ((skb = __skb_dequeue(&p->neigh->queue)))
+                       __skb_queue_tail(&skqueue, skb);
+       spin_unlock_irqrestore(&priv->lock, flags);
+
+       while ((skb = __skb_dequeue(&skqueue))) {
+               skb->dev = p->dev;
+               if (dev_queue_xmit(skb))
+                       ipoib_warn(priv, "dev_queue_xmit failed "
+                                  "to requeue packet\n");
+       }
+
+       ret = ib_send_cm_rtu(cm_id, NULL, 0);
+       if (ret) {
+               ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+               return ret;
+       }
+       return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_qp_init_attr attr = {};
+       attr.recv_cq = priv->cq;
+       attr.srq = priv->cm.srq;
+       attr.cap.max_send_wr = ipoib_sendq_size;
+       attr.cap.max_send_sge = 1;
+       attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+       attr.qp_type = IB_QPT_RC;
+       attr.send_cq = cq;
+       return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_send_req(struct net_device *dev,
+                            struct ib_cm_id *id, struct ib_qp *qp,
+                            u32 qpn,
+                            struct ib_sa_path_rec *pathrec)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_cm_data data = {};
+       struct ib_cm_req_param req = {};
+
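+       /* Private data carries our datagram QPN and receive buffer size to the peer */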
+       data.qpn = cpu_to_be32(priv->qp->qp_num);
+       data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+       req.primary_path              = pathrec;
+       req.alternate_path            = NULL;
+       req.service_id                = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+       req.qp_num                    = qp->qp_num;
+       req.qp_type                   = qp->qp_type;
+       req.private_data              = &data;
+       req.private_data_len          = sizeof data;
+       req.flow_control              = 0;
+
+       req.starting_psn              = 0; /* FIXME */
+
+       /*
+        * Pick some arbitrary defaults here; we could make these
+        * module parameters if anyone cared about setting them.
+        */
+       req.responder_resources       = 4;
+       req.remote_cm_response_timeout = 20;
+       req.local_cm_response_timeout  = 20;
+       req.retry_count               = 0; /* RFC draft warns against retries */
+       req.rnr_retry_count           = 0; /* RFC draft warns against retries */
+       req.max_cm_retries            = 15;
+       req.srq                       = 1;
+       return ib_send_cm_req(id, &req);
+}
+
+static int ipoib_cm_modify_tx_init(struct net_device *dev,
+                                 struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_qp_attr qp_attr;
+       int qp_attr_mask, ret;
+       ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
+       if (ret) {
+               ipoib_warn(priv, "pkey 0x%x not in cache: %d\n", priv->pkey, ret);
+               return ret;
+       }
+
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+       qp_attr.port_num = priv->port;
+       qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+       ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+               return ret;
+       }
+       return 0;
+}
+
+static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
+                           struct ib_sa_path_rec *pathrec)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+       int ret;
+
+       p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring,
+                               GFP_KERNEL);
+       if (!p->tx_ring) {
+               ipoib_warn(priv, "failed to allocate tx ring\n");
+               ret = -ENOMEM;
+               goto err_tx;
+       }
+
+       p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
+                            ipoib_sendq_size + 1);
+       if (IS_ERR(p->cq)) {
+               ret = PTR_ERR(p->cq);
+               ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
+               goto err_cq;
+       }
+
+       ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
+       if (ret) {
+               ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
+               goto err_req_notify;
+       }
+
+       p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+       if (IS_ERR(p->qp)) {
+               ret = PTR_ERR(p->qp);
+               ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+               goto err_qp;
+       }
+
+       p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+       if (IS_ERR(p->id)) {
+               ret = PTR_ERR(p->id);
+               ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+               goto err_id;
+       }
+
+       ret = ipoib_cm_modify_tx_init(p->dev, p->id,  p->qp);
+       if (ret) {
+               ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+               goto err_modify;
+       }
+
+       ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
+       if (ret) {
+               ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+               goto err_send_cm;
+       }
+
+       ipoib_dbg(priv, "Request connection 0x%x for gid " IPOIB_GID_FMT " qpn 0x%x\n",
+                 p->qp->qp_num, IPOIB_GID_ARG(pathrec->dgid), qpn);
+
+       return 0;
+
+err_send_cm:
+err_modify:
+       ib_destroy_cm_id(p->id);
+err_id:
+       p->id = NULL;
+       ib_destroy_qp(p->qp);
+err_req_notify:
+err_qp:
+       p->qp = NULL;
+       ib_destroy_cq(p->cq);
+err_cq:
+       p->cq = NULL;
+err_tx:
+       return ret;
+}
+
+static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+       struct ipoib_tx_buf *tx_req;
+
+       ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
+                 p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
+
+       if (p->id)
+               ib_destroy_cm_id(p->id);
+
+       if (p->qp)
+               ib_destroy_qp(p->qp);
+
+       if (p->cq)
+               ib_destroy_cq(p->cq);
+
+       if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
+               netif_wake_queue(p->dev);
+
+       if (p->tx_ring) {
+               while ((int) p->tx_tail - (int) p->tx_head < 0) {
+                       tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+                       ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+                                        DMA_TO_DEVICE);
+                       dev_kfree_skb_any(tx_req->skb);
+                       ++p->tx_tail;
+               }
+
+               kfree(p->tx_ring);
+       }
+
+       kfree(p);
+}
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+                              struct ib_cm_event *event)
+{
+       struct ipoib_cm_tx *tx = cm_id->context;
+       struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+       struct net_device *dev = priv->dev;
+       struct ipoib_neigh *neigh;
+       unsigned long flags;
+       int ret;
+
+       switch (event->event) {
+       case IB_CM_DREQ_RECEIVED:
+               ipoib_dbg(priv, "DREQ received.\n");
+               ib_send_cm_drep(cm_id, NULL, 0);
+               break;
+       case IB_CM_REP_RECEIVED:
+               ipoib_dbg(priv, "REP received.\n");
+               ret = ipoib_cm_rep_handler(cm_id, event);
+               if (ret)
+                       ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+                                      NULL, 0, NULL, 0);
+               break;
+       case IB_CM_REQ_ERROR:
+       case IB_CM_REJ_RECEIVED:
+       case IB_CM_TIMEWAIT_EXIT:
+               ipoib_dbg(priv, "CM error %d.\n", event->event);
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               spin_lock(&priv->lock);
+               neigh = tx->neigh;
+
+               if (neigh) {
+                       neigh->cm = NULL;
+                       list_del(&neigh->list);
+                       if (neigh->ah)
+                               ipoib_put_ah(neigh->ah);
+                       ipoib_neigh_free(dev, neigh);
+
+                       tx->neigh = NULL;
+               }
+
+               if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+                       list_move(&tx->list, &priv->cm.reap_list);
+                       queue_work(ipoib_workqueue, &priv->cm.reap_task);
+               }
+
+               spin_unlock(&priv->lock);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+                                      struct ipoib_neigh *neigh)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ipoib_cm_tx *tx;
+
+       tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+       if (!tx)
+               return NULL;
+
+       neigh->cm = tx;
+       tx->neigh = neigh;
+       tx->path = path;
+       tx->dev = dev;
+       list_add(&tx->list, &priv->cm.start_list);
+       set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+       queue_work(ipoib_workqueue, &priv->cm.start_task);
+       return tx;
+}
+
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+       if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+               list_move(&tx->list, &priv->cm.reap_list);
+               queue_work(ipoib_workqueue, &priv->cm.reap_task);
+               ipoib_dbg(priv, "Reap connection for gid " IPOIB_GID_FMT "\n",
+                         IPOIB_GID_ARG(tx->neigh->dgid));
+               tx->neigh = NULL;
+       }
+}
+
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                  cm.start_task);
+       struct net_device *dev = priv->dev;
+       struct ipoib_neigh *neigh;
+       struct ipoib_cm_tx *p;
+       unsigned long flags;
+       int ret;
+
+       struct ib_sa_path_rec pathrec;
+       u32 qpn;
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       spin_lock(&priv->lock);
+       while (!list_empty(&priv->cm.start_list)) {
+               p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+               list_del_init(&p->list);
+               neigh = p->neigh;
+               qpn = IPOIB_QPN(neigh->neighbour->ha);
+               memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+               spin_unlock(&priv->lock);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
+               ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               spin_lock(&priv->lock);
+               if (ret) {
+                       neigh = p->neigh;
+                       if (neigh) {
+                               neigh->cm = NULL;
+                               list_del(&neigh->list);
+                               if (neigh->ah)
+                                       ipoib_put_ah(neigh->ah);
+                               ipoib_neigh_free(dev, neigh);
+                       }
+                       list_del(&p->list);
+                       kfree(p);
+               }
+       }
+       spin_unlock(&priv->lock);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                  cm.reap_task);
+       struct ipoib_cm_tx *p;
+       unsigned long flags;
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       spin_lock(&priv->lock);
+       while (!list_empty(&priv->cm.reap_list)) {
+               p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+               list_del(&p->list);
+               spin_unlock(&priv->lock);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
+               ipoib_cm_tx_destroy(p);
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               spin_lock(&priv->lock);
+       }
+       spin_unlock(&priv->lock);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+static void ipoib_cm_skb_reap(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                  cm.skb_task);
+       struct net_device *dev = priv->dev;
+       struct sk_buff *skb;
+       unsigned long flags;
+
+       unsigned mtu = priv->mcast_mtu;
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       spin_lock(&priv->lock);
+       while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
+               spin_unlock(&priv->lock);
+               spin_unlock_irqrestore(&priv->tx_lock, flags);
+               if (skb->protocol == htons(ETH_P_IP))
+                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+               else if (skb->protocol == htons(ETH_P_IPV6))
+                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+#endif
+               dev_kfree_skb_any(skb);
+               spin_lock_irqsave(&priv->tx_lock, flags);
+               spin_lock(&priv->lock);
+       }
+       spin_unlock(&priv->lock);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+}
+
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+                          unsigned int mtu)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       int e = skb_queue_empty(&priv->cm.skb_queue);
+
+       if (skb->dst)
+               skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+       skb_queue_tail(&priv->cm.skb_queue, skb);
+       if (e)
+               queue_work(ipoib_workqueue, &priv->cm.skb_task);
+}
+
+static void ipoib_cm_stale_task(struct work_struct *work)
+{
+       struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+                                                  cm.stale_task.work);
+       struct ipoib_cm_rx *p;
+       unsigned long flags;
+
+       spin_lock_irqsave(&priv->lock, flags);
+       while (!list_empty(&priv->cm.passive_ids)) {
+               /* List is sorted by LRU, start from tail,
+                * stop when we see a recently used entry */
+               p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
+               if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
+                       break;
+               list_del_init(&p->list);
+               spin_unlock_irqrestore(&priv->lock, flags);
+               ib_destroy_cm_id(p->id);
+               ib_destroy_qp(p->qp);
+               kfree(p);
+               spin_lock_irqsave(&priv->lock, flags);
+       }
+       spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static ssize_t show_mode(struct device *d, struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
+
+       if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
+               return sprintf(buf, "connected\n");
+       else
+               return sprintf(buf, "datagram\n");
+}
+
+static ssize_t set_mode(struct device *d, struct device_attribute *attr,
+                       const char *buf, size_t count)
+{
+       struct net_device *dev = to_net_dev(d);
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+       /* flush paths if we switch modes so that connections are restarted */
+       if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
+               set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+               ipoib_warn(priv, "enabling connected mode "
+                          "will cause multicast packet drops\n");
+               ipoib_flush_paths(dev);
+               return count;
+       }
+
+       if (!strcmp(buf, "datagram\n")) {
+               clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+               dev->mtu = min(priv->mcast_mtu, dev->mtu);
+               ipoib_flush_paths(dev);
+               return count;
+       }
+
+       return -EINVAL;
+}
+
+static DEVICE_ATTR(mode, S_IWUGO | S_IRUGO, show_mode, set_mode);
+
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+       return device_create_file(&dev->dev, &dev_attr_mode);
+}
+
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       struct ib_srq_init_attr srq_init_attr = {
+               .attr = {
+                       .max_wr  = ipoib_recvq_size,
+                       .max_sge = IPOIB_CM_RX_SG
+               }
+       };
+       int ret, i;
+
+       INIT_LIST_HEAD(&priv->cm.passive_ids);
+       INIT_LIST_HEAD(&priv->cm.reap_list);
+       INIT_LIST_HEAD(&priv->cm.start_list);
+       INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
+       INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
+       INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+       INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
+
+       skb_queue_head_init(&priv->cm.skb_queue);
+
+       priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+       if (IS_ERR(priv->cm.srq)) {
+               ret = PTR_ERR(priv->cm.srq);
+               priv->cm.srq = NULL;
+               return ret;
+       }
+
+       priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
+                                   GFP_KERNEL);
+       if (!priv->cm.srq_ring) {
+               printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
+                      priv->ca->name, ipoib_recvq_size);
+               ipoib_cm_dev_cleanup(dev);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+               priv->cm.rx_sge[i].lkey = priv->mr->lkey;
+
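+       /* The first scatter entry is the small header buffer, the rest are full pages */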
+       priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
+       for (i = 1; i < IPOIB_CM_RX_SG; ++i)
+               priv->cm.rx_sge[i].length = PAGE_SIZE;
+       priv->cm.rx_wr.next = NULL;
+       priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
+       priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
+
+       for (i = 0; i < ipoib_recvq_size; ++i) {
+               if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) {
+                       ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+                       ipoib_cm_dev_cleanup(dev);
+                       return -ENOMEM;
+               }
+               if (ipoib_cm_post_receive(dev, i)) {
+                       ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+                       ipoib_cm_dev_cleanup(dev);
+                       return -EIO;
+               }
+       }
+
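+       /* Set the connected-mode flag in the first byte of the hardware address */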
+       priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
+       return 0;
+}
+
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       int i, ret;
+
+       if (!priv->cm.srq)
+               return;
+
+       ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
+
+       ret = ib_destroy_srq(priv->cm.srq);
+       if (ret)
+               ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
+
+       priv->cm.srq = NULL;
+       if (!priv->cm.srq_ring)
+               return;
+       for (i = 0; i < ipoib_recvq_size; ++i)
+               if (priv->cm.srq_ring[i].skb) {
+                       ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping);
+                       dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
+                       priv->cm.srq_ring[i].skb = NULL;
+               }
+       kfree(priv->cm.srq_ring);
+       priv->cm.srq_ring = NULL;
+}
index 59d9594ed6d97e0fc243066f42f94c8f8f3c0ca6..f2aa923ddbeaf7a39bbaf2ae11cbf1c2180067da 100644 (file)
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
                 "Enable data path debug tracing if > 0");
 #endif
 
-#define        IPOIB_OP_RECV   (1ul << 31)
-
 static DEFINE_MUTEX(pkey_mutex);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
        spin_lock_irqsave(&priv->tx_lock, flags);
        ++priv->tx_tail;
-       if (netif_queue_stopped(dev) &&
-           test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
-           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+       if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
+           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
+               clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
                netif_wake_queue(dev);
+       }
        spin_unlock_irqrestore(&priv->tx_lock, flags);
 
        if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
 {
-       if (wc->wr_id & IPOIB_OP_RECV)
+       if (wc->wr_id & IPOIB_CM_OP_SRQ)
+               ipoib_cm_handle_rx_wc(dev, wc);
+       else if (wc->wr_id & IPOIB_OP_RECV)
                ipoib_ib_handle_rx_wc(dev, wc);
        else
                ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
        struct ipoib_tx_buf *tx_req;
        u64 addr;
 
-       if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) {
+       if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
                ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
-                          skb->len, dev->mtu + INFINIBAND_ALEN);
+                          skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
                ++priv->stats.tx_dropped;
                ++priv->stats.tx_errors;
-               dev_kfree_skb_any(skb);
+               ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
                return;
        }
 
@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
                if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
                        ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
                        netif_stop_queue(dev);
+                       set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
                }
        }
 }
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
                return -1;
        }
 
+       ret = ipoib_cm_dev_open(dev);
+       if (ret) {
+               ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+               ipoib_ib_dev_stop(dev);
+               return -1;
+       }
+
        clear_bit(IPOIB_STOP_REAPER, &priv->flags);
        queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
 
@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
 
        clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
+       ipoib_cm_dev_stop(dev);
+
        /*
         * Move our QP to the error state and then reinitialize in
         * when all work requests have completed or have been flushed.
index af5ee2ec4499799a80b8e98f4a7a7a02628fbb93..18d27fd352ad9667972b71a6a2242a12ded97d39 100644 (file)
@@ -49,8 +49,6 @@
 
 #include <net/dst.h>
 
-#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
-
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev)
 
        netif_stop_queue(dev);
 
+       clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+
        /*
         * Now flush workqueue to make sure a scheduled task doesn't
         * bring our internal state back up.
@@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-       if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+       /* dev->mtu > 2K ==> connected mode */
+       if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
+               if (new_mtu > priv->mcast_mtu)
+                       ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+                                  priv->mcast_mtu);
+               dev->mtu = new_mtu;
+               return 0;
+       }
+
+       if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
                return -EINVAL;
+       }
 
        priv->admin_mtu = new_mtu;
 
@@ -414,6 +424,20 @@ static void path_rec_completion(int status,
                        memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
                               sizeof(union ib_gid));
 
+                       if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+                               if (!ipoib_cm_get(neigh))
+                                       ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
+                                                                              path,
+                                                                              neigh));
+                               if (!ipoib_cm_get(neigh)) {
+                                       list_del(&neigh->list);
+                                       if (neigh->ah)
+                                               ipoib_put_ah(neigh->ah);
+                                       ipoib_neigh_free(dev, neigh);
+                                       continue;
+                               }
+                       }
+
                        while ((skb = __skb_dequeue(&neigh->queue)))
                                __skb_queue_tail(&skqueue, skb);
                }
@@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
                memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
                       sizeof(union ib_gid));
 
-               ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+               if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+                       if (!ipoib_cm_get(neigh))
+                               ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
+                       if (!ipoib_cm_get(neigh)) {
+                               list_del(&neigh->list);
+                               if (neigh->ah)
+                                       ipoib_put_ah(neigh->ah);
+                               ipoib_neigh_free(dev, neigh);
+                               goto err_drop;
+                       }
+                       if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+                               __skb_queue_tail(&neigh->queue, skb);
+                       else {
+                               ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
+                                          skb_queue_len(&neigh->queue));
+                               goto err_drop;
+                       }
+               } else
+                       ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
        } else {
                neigh->ah  = NULL;
 
@@ -538,6 +580,7 @@ err_list:
 
 err_path:
        ipoib_neigh_free(dev, neigh);
+err_drop:
        ++priv->stats.tx_dropped;
        dev_kfree_skb_any(skb);
 
@@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
                neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-               if (likely(neigh->ah)) {
+               if (ipoib_cm_get(neigh)) {
+                       if (ipoib_cm_up(neigh)) {
+                               ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+                               goto out;
+                       }
+               } else if (neigh->ah) {
                        if (unlikely(memcmp(&neigh->dgid.raw,
                                            skb->dst->neighbour->ha + 4,
                                            sizeof(union ib_gid)))) {
@@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
        neigh->neighbour = neighbour;
        *to_ipoib_neigh(neighbour) = neigh;
        skb_queue_head_init(&neigh->queue);
+       ipoib_cm_set(neigh, NULL);
 
        return neigh;
 }
@@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
                ++priv->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }
+       if (ipoib_cm_get(neigh))
+               ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
        kfree(neigh);
 }
 
@@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format,
 
        ipoib_create_debug_files(priv->dev);
 
+       if (ipoib_cm_add_mode_attr(priv->dev))
+               goto sysfs_failed;
        if (ipoib_add_pkey_attr(priv->dev))
                goto sysfs_failed;
        if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
index b04b72ca32eda5816e79780ed5553f51d4e53d1e..fea737f520fdfeae5a8a7b85765799552406b379 100644 (file)
@@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work)
 
        priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
                IPOIB_ENCAP_LEN;
-       dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+       if (!ipoib_cm_admin_enabled(dev))
+               dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
 
        ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
 
index 7b717c648f727bb52ac8d33fcb5bcb6de9cefe5e..3cb551b8875625960654e587facebf02e3fcd838 100644 (file)
@@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
                .qp_type     = IB_QPT_UD
        };
 
+       int ret, size;
+
        priv->pd = ib_alloc_pd(priv->ca);
        if (IS_ERR(priv->pd)) {
                printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
                return -ENODEV;
        }
 
-       priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-                               ipoib_sendq_size + ipoib_recvq_size + 1);
+       priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+       if (IS_ERR(priv->mr)) {
+               printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+               goto out_free_pd;
+       }
+
+       size = ipoib_sendq_size + ipoib_recvq_size + 1;
+       ret = ipoib_cm_dev_init(dev);
+       if (!ret)
+               size += ipoib_recvq_size;
+
+       priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size);
        if (IS_ERR(priv->cq)) {
                printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
-               goto out_free_pd;
+               goto out_free_mr;
        }
 
        if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
                goto out_free_cq;
 
-       priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
-       if (IS_ERR(priv->mr)) {
-               printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
-               goto out_free_cq;
-       }
-
        init_attr.send_cq = priv->cq;
        init_attr.recv_cq = priv->cq,
 
        priv->qp = ib_create_qp(priv->pd, &init_attr);
        if (IS_ERR(priv->qp)) {
                printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
-               goto out_free_mr;
+               goto out_free_cq;
        }
 
        priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
@@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 
        return 0;
 
-out_free_mr:
-       ib_dereg_mr(priv->mr);
-
 out_free_cq:
        ib_destroy_cq(priv->cq);
 
+out_free_mr:
+       ib_dereg_mr(priv->mr);
+
 out_free_pd:
        ib_dealloc_pd(priv->pd);
        return -ENODEV;
@@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
                clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
        }
 
-       if (ib_dereg_mr(priv->mr))
-               ipoib_warn(priv, "ib_dereg_mr failed\n");
-
        if (ib_destroy_cq(priv->cq))
                ipoib_warn(priv, "ib_cq_destroy failed\n");
 
+       ipoib_cm_dev_cleanup(dev);
+
+       if (ib_dereg_mr(priv->mr))
+               ipoib_warn(priv, "ib_dereg_mr failed\n");
+
        if (ib_dealloc_pd(priv->pd))
                ipoib_warn(priv, "ib_dealloc_pd failed\n");
 }
index 085eafe6667cf3626274d4e4f462705a3b993af3..6762988439d1f6fcd96b82aad8a25f51e7bf5a86 100644 (file)
@@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
 
        ipoib_create_debug_files(priv->dev);
 
+       if (ipoib_cm_add_mode_attr(priv->dev))
+               goto sysfs_failed;
        if (ipoib_add_pkey_attr(priv->dev))
                goto sysfs_failed;