
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (81 commits)
  RDMA/cxgb3: Fix the T3A workaround checks
  IB/ipath: Remove unnecessary cast
  IPoIB: Constify seq_operations function pointer tables
  RDMA/cxgb3: Mark QP as privileged based on user capabilities
  RDMA/cxgb3: Fix page shift calculation in build_phys_page_list()
  RDMA/cxgb3: Flush the receive queue when closing
  IB/ipath: Trivial simplification of ipath_make_ud_req()
  IB/mthca: Update latest "native Arbel" firmware revision
  IPoIB: Remove redundant check of netif_queue_stopped() in xmit handler
  IB/ipath: Add mappings from HW register to PortInfo port physical state
  IB/ipath: Changes to support PIO bandwidth check on IBA7220
  IB/ipath: Minor cleanup of unused fields and chip-specific errors
  IB/ipath: New sysfs entries to control 7220 features
  IB/ipath: Add new chip-specific functions to older chips, consistent init
  IB/ipath: Remove unused MDIO interface code
  IB/ehca: Prevent RDMA-related connection failures on some eHCA2 hardware
  IB/ehca: Add "port connection autodetect mode"
  IB/ehca: Define array to store SMI/GSI QPs
  IB/ehca: Remove CQ-QP-link before destroying QP in error path of create_qp()
  IB/iser: Add change_queue_depth method
  ...
Linus Torvalds (17 years ago), commit 99f1c97dbd
75 changed files with 3124 additions and 1185 deletions
  1. Documentation/feature-removal-schedule.txt (+0 -10)
  2. drivers/infiniband/core/cm.c (+288 -18)
  3. drivers/infiniband/core/cma.c (+33 -27)
  4. drivers/infiniband/core/fmr_pool.c (+21 -12)
  5. drivers/infiniband/core/mad.c (+10 -16)
  6. drivers/infiniband/core/mad_priv.h (+2 -1)
  7. drivers/infiniband/core/mad_rmpp.c (+1 -1)
  8. drivers/infiniband/core/multicast.c (+45 -10)
  9. drivers/infiniband/core/smi.h (+17 -1)
  10. drivers/infiniband/core/ucm.c (+15 -22)
  11. drivers/infiniband/core/ucma.c (+92 -0)
  12. drivers/infiniband/core/user_mad.c (+53 -62)
  13. drivers/infiniband/hw/cxgb3/cxio_hal.c (+2 -2)
  14. drivers/infiniband/hw/cxgb3/cxio_wr.h (+3 -2)
  15. drivers/infiniband/hw/cxgb3/iwch_cm.c (+2 -2)
  16. drivers/infiniband/hw/cxgb3/iwch_mem.c (+7 -0)
  17. drivers/infiniband/hw/cxgb3/iwch_provider.c (+6 -1)
  18. drivers/infiniband/hw/cxgb3/iwch_qp.c (+8 -21)
  19. drivers/infiniband/hw/ehca/ehca_av.c (+1 -1)
  20. drivers/infiniband/hw/ehca/ehca_classes.h (+22 -1)
  21. drivers/infiniband/hw/ehca/ehca_cq.c (+1 -1)
  22. drivers/infiniband/hw/ehca/ehca_irq.c (+33 -5)
  23. drivers/infiniband/hw/ehca/ehca_iverbs.h (+2 -0)
  24. drivers/infiniband/hw/ehca/ehca_main.c (+9 -6)
  25. drivers/infiniband/hw/ehca/ehca_qp.c (+169 -11)
  26. drivers/infiniband/hw/ehca/ehca_reqs.c (+80 -32)
  27. drivers/infiniband/hw/ehca/ehca_sqp.c (+3 -3)
  28. drivers/infiniband/hw/ipath/ipath_common.h (+31 -4)
  29. drivers/infiniband/hw/ipath/ipath_cq.c (+1 -1)
  30. drivers/infiniband/hw/ipath/ipath_debug.h (+2 -2)
  31. drivers/infiniband/hw/ipath/ipath_driver.c (+74 -106)
  32. drivers/infiniband/hw/ipath/ipath_eeprom.c (+11 -12)
  33. drivers/infiniband/hw/ipath/ipath_file_ops.c (+51 -43)
  34. drivers/infiniband/hw/ipath/ipath_fs.c (+4 -10)
  35. drivers/infiniband/hw/ipath/ipath_iba6110.c (+346 -49)
  36. drivers/infiniband/hw/ipath/ipath_iba6120.c (+350 -89)
  37. drivers/infiniband/hw/ipath/ipath_init_chip.c (+27 -40)
  38. drivers/infiniband/hw/ipath/ipath_intr.c (+39 -42)
  39. drivers/infiniband/hw/ipath/ipath_kernel.h (+149 -52)
  40. drivers/infiniband/hw/ipath/ipath_keys.c (+2 -3)
  41. drivers/infiniband/hw/ipath/ipath_mad.c (+77 -46)
  42. drivers/infiniband/hw/ipath/ipath_qp.c (+2 -4)
  43. drivers/infiniband/hw/ipath/ipath_rc.c (+7 -11)
  44. drivers/infiniband/hw/ipath/ipath_registers.h (+15 -18)
  45. drivers/infiniband/hw/ipath/ipath_ruc.c (+11 -2)
  46. drivers/infiniband/hw/ipath/ipath_srq.c (+2 -2)
  47. drivers/infiniband/hw/ipath/ipath_stats.c (+12 -12)
  48. drivers/infiniband/hw/ipath/ipath_sysfs.c (+364 -0)
  49. drivers/infiniband/hw/ipath/ipath_ud.c (+1 -2)
  50. drivers/infiniband/hw/ipath/ipath_verbs.c (+37 -18)
  51. drivers/infiniband/hw/ipath/ipath_verbs.h (+12 -0)
  52. drivers/infiniband/hw/mlx4/cq.c (+5 -4)
  53. drivers/infiniband/hw/mthca/mthca_dev.h (+6 -7)
  54. drivers/infiniband/hw/mthca/mthca_eq.c (+2 -4)
  55. drivers/infiniband/hw/mthca/mthca_main.c (+6 -34)
  56. drivers/infiniband/ulp/ipoib/ipoib.h (+108 -76)
  57. drivers/infiniband/ulp/ipoib/ipoib_cm.c (+274 -102)
  58. drivers/infiniband/ulp/ipoib/ipoib_fs.c (+2 -2)
  59. drivers/infiniband/ulp/ipoib/ipoib_ib.c (+4 -4)
  60. drivers/infiniband/ulp/ipoib/ipoib_main.c (+28 -32)
  61. drivers/infiniband/ulp/ipoib/ipoib_multicast.c (+2 -6)
  62. drivers/infiniband/ulp/ipoib/ipoib_verbs.c (+11 -7)
  63. drivers/infiniband/ulp/iser/Kconfig (+2 -2)
  64. drivers/infiniband/ulp/iser/iscsi_iser.c (+1 -0)
  65. drivers/infiniband/ulp/iser/iser_initiator.c (+1 -1)
  66. drivers/infiniband/ulp/iser/iser_verbs.c (+3 -5)
  67. drivers/infiniband/ulp/srp/ib_srp.c (+81 -50)
  68. drivers/infiniband/ulp/srp/ib_srp.h (+5 -0)
  69. drivers/net/mlx4/fw.c (+1 -1)
  70. include/net/if_inet6.h (+7 -4)
  71. include/net/ip.h (+6 -4)
  72. include/rdma/ib_mad.h (+3 -1)
  73. include/rdma/rdma_user_cm.h (+12 -1)
  74. net/ipv4/arp.c (+1 -1)
  75. net/ipv6/ndisc.c (+1 -1)

+ 0 - 10
Documentation/feature-removal-schedule.txt

@@ -295,16 +295,6 @@ Who:  linuxppc-dev@ozlabs.org
 
 ---------------------------
 
-What:	mthca driver's MSI support
-When:	January 2008
-Files:	drivers/infiniband/hw/mthca/*.[ch]
-Why:	All mthca hardware also supports MSI-X, which provides
-	strictly more functionality than MSI.  So there is no point in
-	having both MSI-X and MSI support in the driver.
-Who:	Roland Dreier <rolandd@cisco.com>
-
----------------------------
-
 What:   sk98lin network driver
 When:   Feburary 2008
 Why:    In kernel tree version of driver is unmaintained. Sk98lin driver

+ 288 - 18
drivers/infiniband/core/cm.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2004-2007 Intel Corporation.  All rights reserved.
  * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
  * Copyright (c) 2004, 2005 Voltaire Corporation.  All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
@@ -37,12 +37,14 @@
 
 #include <linux/completion.h>
 #include <linux/dma-mapping.h>
+#include <linux/device.h>
 #include <linux/err.h>
 #include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/random.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock.h>
+#include <linux/sysfs.h>
 #include <linux/workqueue.h>
 
 #include <rdma/ib_cache.h>
@@ -78,17 +80,94 @@ static struct ib_cm {
 	struct workqueue_struct *wq;
 } cm;
 
+/* Counter indexes ordered by attribute ID */
+enum {
+	CM_REQ_COUNTER,
+	CM_MRA_COUNTER,
+	CM_REJ_COUNTER,
+	CM_REP_COUNTER,
+	CM_RTU_COUNTER,
+	CM_DREQ_COUNTER,
+	CM_DREP_COUNTER,
+	CM_SIDR_REQ_COUNTER,
+	CM_SIDR_REP_COUNTER,
+	CM_LAP_COUNTER,
+	CM_APR_COUNTER,
+	CM_ATTR_COUNT,
+	CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+	CM_XMIT,
+	CM_XMIT_RETRIES,
+	CM_RECV,
+	CM_RECV_DUPLICATES,
+	CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+				     [sizeof("cm_rx_duplicates")] = {
+	"cm_tx_msgs", "cm_tx_retries",
+	"cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+	struct kobject obj;
+	atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+	struct attribute attr;
+	int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+	.attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \
+	.index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+	&cm_req_counter_attr.attr,
+	&cm_mra_counter_attr.attr,
+	&cm_rej_counter_attr.attr,
+	&cm_rep_counter_attr.attr,
+	&cm_rtu_counter_attr.attr,
+	&cm_dreq_counter_attr.attr,
+	&cm_drep_counter_attr.attr,
+	&cm_sidr_req_counter_attr.attr,
+	&cm_sidr_rep_counter_attr.attr,
+	&cm_lap_counter_attr.attr,
+	&cm_apr_counter_attr.attr,
+	NULL
+};
+
 struct cm_port {
 	struct cm_device *cm_dev;
 	struct ib_mad_agent *mad_agent;
+	struct kobject port_obj;
 	u8 port_num;
+	struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
 };
 
 struct cm_device {
 	struct list_head list;
 	struct ib_device *device;
+	struct kobject dev_obj;
 	u8 ack_delay;
-	struct cm_port port[0];
+	struct cm_port *port[0];
 };
 
 struct cm_av {
@@ -278,7 +357,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 	list_for_each_entry(cm_dev, &cm.device_list, list) {
 		if (!ib_find_cached_gid(cm_dev->device, &path->sgid,
 					&p, NULL)) {
-			port = &cm_dev->port[p-1];
+			port = cm_dev->port[p-1];
 			break;
 		}
 	}
@@ -1270,6 +1349,9 @@ static void cm_dup_req_handler(struct cm_work *work,
 	struct ib_mad_send_buf *msg = NULL;
 	int ret;
 
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REQ_COUNTER]);
+
 	/* Quick state check to discard duplicate REQs. */
 	if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
 		return;
@@ -1616,6 +1698,8 @@ static void cm_dup_rep_handler(struct cm_work *work)
 	if (!cm_id_priv)
 		return;
 
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
 	ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
 	if (ret)
 		goto deref;
@@ -1781,6 +1865,8 @@ static int cm_rtu_handler(struct cm_work *work)
 	if (cm_id_priv->id.state != IB_CM_REP_SENT &&
 	    cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
 		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
 		goto out;
 	}
 	cm_id_priv->id.state = IB_CM_ESTABLISHED;
@@ -1958,6 +2044,8 @@ static int cm_dreq_handler(struct cm_work *work)
 	cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
 				   dreq_msg->local_comm_id);
 	if (!cm_id_priv) {
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
 		cm_issue_drep(work->port, work->mad_recv_wc);
 		return -EINVAL;
 	}
@@ -1977,6 +2065,8 @@ static int cm_dreq_handler(struct cm_work *work)
 	case IB_CM_MRA_REP_RCVD:
 		break;
 	case IB_CM_TIMEWAIT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
 		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
 			goto unlock;
 
@@ -1988,6 +2078,10 @@ static int cm_dreq_handler(struct cm_work *work)
 		if (ib_post_send_mad(msg, NULL))
 			cm_free_msg(msg);
 		goto deref;
+	case IB_CM_DREQ_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_DREQ_COUNTER]);
+		goto unlock;
 	default:
 		goto unlock;
 	}
@@ -2339,10 +2433,20 @@ static int cm_mra_handler(struct cm_work *work)
 		if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
 		    cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
 		    ib_modify_mad(cm_id_priv->av.port->mad_agent,
-				  cm_id_priv->msg, timeout))
+				  cm_id_priv->msg, timeout)) {
+			if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+				atomic_long_inc(&work->port->
+						counter_group[CM_RECV_DUPLICATES].
+						counter[CM_MRA_COUNTER]);
 			goto out;
+		}
 		cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
 		break;
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_MRA_REP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_MRA_COUNTER]);
+		/* fall through */
 	default:
 		goto out;
 	}
@@ -2502,6 +2606,8 @@ static int cm_lap_handler(struct cm_work *work)
 	case IB_CM_LAP_IDLE:
 		break;
 	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
 		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
 			goto unlock;
 
@@ -2515,6 +2621,10 @@ static int cm_lap_handler(struct cm_work *work)
 		if (ib_post_send_mad(msg, NULL))
 			cm_free_msg(msg);
 		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
 	default:
 		goto unlock;
 	}
@@ -2796,6 +2906,8 @@ static int cm_sidr_req_handler(struct cm_work *work)
 	cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
 	if (cur_cm_id_priv) {
 		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
 		goto out; /* Duplicate message. */
 	}
 	cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
@@ -2990,6 +3102,27 @@ static void cm_send_handler(struct ib_mad_agent *mad_agent,
 			    struct ib_mad_send_wc *mad_send_wc)
 {
 	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port;
+	u16 attr_index;
+
+	port = mad_agent->context;
+	attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+				  msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries)
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
 
 	switch (mad_send_wc->status) {
 	case IB_WC_SUCCESS:
@@ -3148,8 +3281,10 @@ EXPORT_SYMBOL(ib_cm_notify);
 static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 			    struct ib_mad_recv_wc *mad_recv_wc)
 {
+	struct cm_port *port = mad_agent->context;
 	struct cm_work *work;
 	enum ib_cm_event_type event;
+	u16 attr_id;
 	int paths = 0;
 
 	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
@@ -3194,6 +3329,10 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 		return;
 	}
 
+	attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+	atomic_long_inc(&port->counter_group[CM_RECV].
+			counter[attr_id - CM_ATTR_ID_OFFSET]);
+
 	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
 		       GFP_KERNEL);
 	if (!work) {
@@ -3204,7 +3343,7 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent,
 	INIT_DELAYED_WORK(&work->work, cm_work_handler);
 	work->cm_event.event = event;
 	work->mad_recv_wc = mad_recv_wc;
-	work->port = (struct cm_port *)mad_agent->context;
+	work->port = port;
 	queue_delayed_work(cm.wq, &work->work, 0);
 }
 
@@ -3379,6 +3518,108 @@ static void cm_get_ack_delay(struct cm_device *cm_dev)
 		cm_dev->ack_delay = attr.local_ca_ack_delay;
 }
 
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_group *group;
+	struct cm_counter_attribute *cm_attr;
+
+	group = container_of(obj, struct cm_counter_group, obj);
+	cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+	return sprintf(buf, "%ld\n",
+		       atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+	.sysfs_ops = &cm_counter_ops,
+	.default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+	struct cm_port *cm_port;
+
+	printk(KERN_ERR "free cm port\n");
+
+	cm_port = container_of(obj, struct cm_port, port_obj);
+	kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj
+};
+
+static void cm_release_dev_obj(struct kobject *obj)
+{
+	struct cm_device *cm_dev;
+
+	printk(KERN_ERR "free cm dev\n");
+
+	cm_dev = container_of(obj, struct cm_device, dev_obj);
+	kfree(cm_dev);
+}
+
+static struct kobj_type cm_dev_obj_type = {
+	.release = cm_release_dev_obj
+};
+
+struct class cm_class = {
+	.name    = "infiniband_cm",
+};
+EXPORT_SYMBOL(cm_class);
+
+static void cm_remove_fs_obj(struct kobject *obj)
+{
+	kobject_put(obj->parent);
+	kobject_put(obj);
+}
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   kobject_get(&port->cm_dev->dev_obj),
+				   "%d", port->port_num);
+	if (ret) {
+		kfree(port);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   kobject_get(&port->port_obj),
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	while (i--)
+		cm_remove_fs_obj(&port->counter_group[i].obj);
+	cm_remove_fs_obj(&port->port_obj);
+	return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int i;
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++)
+		cm_remove_fs_obj(&port->counter_group[i].obj);
+
+	cm_remove_fs_obj(&port->port_obj);
+}
+
 static void cm_add_one(struct ib_device *device)
 {
 	struct cm_device *cm_dev;
@@ -3397,7 +3638,7 @@ static void cm_add_one(struct ib_device *device)
 	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
 		return;
 
-	cm_dev = kmalloc(sizeof(*cm_dev) + sizeof(*port) *
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
 			 device->phys_port_cnt, GFP_KERNEL);
 	if (!cm_dev)
 		return;
@@ -3405,11 +3646,27 @@ static void cm_add_one(struct ib_device *device)
 	cm_dev->device = device;
 	cm_get_ack_delay(cm_dev);
 
+	ret = kobject_init_and_add(&cm_dev->dev_obj, &cm_dev_obj_type,
+				   &cm_class.subsys.kobj, "%s", device->name);
+	if (ret) {
+		kfree(cm_dev);
+		return;
+	}
+
 	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
 	for (i = 1; i <= device->phys_port_cnt; i++) {
-		port = &cm_dev->port[i-1];
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto error1;
+
+		cm_dev->port[i-1] = port;
 		port->cm_dev = cm_dev;
 		port->port_num = i;
+
+		ret = cm_create_port_fs(port);
+		if (ret)
+			goto error1;
+
 		port->mad_agent = ib_register_mad_agent(device, i,
 							IB_QPT_GSI,
 							&reg_req,
@@ -3418,11 +3675,11 @@ static void cm_add_one(struct ib_device *device)
 							cm_recv_handler,
 							port);
 		if (IS_ERR(port->mad_agent))
-			goto error1;
+			goto error2;
 
 		ret = ib_modify_port(device, i, 0, &port_modify);
 		if (ret)
-			goto error2;
+			goto error3;
 	}
 	ib_set_client_data(device, &cm_client, cm_dev);
 
@@ -3431,17 +3688,20 @@ static void cm_add_one(struct ib_device *device)
 	write_unlock_irqrestore(&cm.device_lock, flags);
 	return;
 
-error2:
+error3:
 	ib_unregister_mad_agent(port->mad_agent);
+error2:
+	cm_remove_port_fs(port);
 error1:
 	port_modify.set_port_cap_mask = 0;
 	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
 	while (--i) {
-		port = &cm_dev->port[i-1];
+		port = cm_dev->port[i-1];
 		ib_modify_port(device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
 	}
-	kfree(cm_dev);
+	cm_remove_fs_obj(&cm_dev->dev_obj);
 }
 
 static void cm_remove_one(struct ib_device *device)
@@ -3463,11 +3723,12 @@ static void cm_remove_one(struct ib_device *device)
 	write_unlock_irqrestore(&cm.device_lock, flags);
 
 	for (i = 1; i <= device->phys_port_cnt; i++) {
-		port = &cm_dev->port[i-1];
+		port = cm_dev->port[i-1];
 		ib_modify_port(device, port->port_num, 0, &port_modify);
 		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
 	}
-	kfree(cm_dev);
+	cm_remove_fs_obj(&cm_dev->dev_obj);
 }
 
 static int __init ib_cm_init(void)
@@ -3488,17 +3749,25 @@ static int __init ib_cm_init(void)
 	idr_pre_get(&cm.local_id_table, GFP_KERNEL);
 	INIT_LIST_HEAD(&cm.timewait_list);
 
-	cm.wq = create_workqueue("ib_cm");
-	if (!cm.wq)
+	ret = class_register(&cm_class);
+	if (ret)
 		return -ENOMEM;
 
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error1;
+	}
+
 	ret = ib_register_client(&cm_client);
 	if (ret)
-		goto error;
+		goto error2;
 
 	return 0;
-error:
+error2:
 	destroy_workqueue(cm.wq);
+error1:
+	class_unregister(&cm_class);
 	return ret;
 }
 
@@ -3519,6 +3788,7 @@ static void __exit ib_cm_cleanup(void)
 	}
 
 	ib_unregister_client(&cm_client);
+	class_unregister(&cm_class);
 	idr_destroy(&cm.local_id_table);
 }
 

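The cm.c changes above add per-port counters for every CM MAD type, grouped into tx, tx-retries, rx, and rx-duplicates kobjects under the shared infiniband_cm class. A minimal userspace reader is sketched below; the sysfs path layout (class directory, device name, port number, group, attribute) is an assumption inferred from cm_create_port_fs() and may differ on a given system.

/*
 * Hypothetical reader for one of the new CM counters.  The default path
 * (/sys/class/infiniband_cm/<device>/<port>/<group>/<attribute>) is an
 * assumption based on the kobject hierarchy built in cm_create_port_fs();
 * pass a different path on the command line if your layout differs.
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] :
		"/sys/class/infiniband_cm/mthca0/1/cm_rx_msgs/req";
	FILE *f = fopen(path, "r");
	long value;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%ld", &value) != 1) {
		fprintf(stderr, "unexpected counter format\n");
		fclose(f);
		return 1;
	}
	fclose(f);
	printf("%s = %ld\n", path, value);
	return 0;
}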
+ 33 - 27
drivers/infiniband/core/cma.c

@@ -488,7 +488,8 @@ void rdma_destroy_qp(struct rdma_cm_id *id)
 }
 EXPORT_SYMBOL(rdma_destroy_qp);
 
-static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
 {
 	struct ib_qp_attr qp_attr;
 	int qp_attr_mask, ret;
@@ -514,13 +515,16 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv)
 	if (ret)
 		goto out;
 
+	if (conn_param)
+		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
 	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
 out:
 	mutex_unlock(&id_priv->qp_mutex);
 	return ret;
 }
 
-static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
 {
 	struct ib_qp_attr qp_attr;
 	int qp_attr_mask, ret;
@@ -536,6 +540,8 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv)
 	if (ret)
 		goto out;
 
+	if (conn_param)
+		qp_attr.max_rd_atomic = conn_param->initiator_depth;
 	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
 out:
 	mutex_unlock(&id_priv->qp_mutex);
@@ -866,11 +872,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv)
 {
 	int ret;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, NULL);
 	if (ret)
 		goto reject;
 
-	ret = cma_modify_qp_rts(id_priv);
+	ret = cma_modify_qp_rts(id_priv, NULL);
 	if (ret)
 		goto reject;
 
@@ -1122,8 +1128,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
 	cm_id->cm_handler = cma_ib_handler;
 
 	ret = conn_id->id.event_handler(&conn_id->id, &event);
-	if (!ret)
+	if (!ret) {
+		cma_enable_remove(conn_id);
 		goto out;
+	}
 
 	/* Destroy the CM ID by returning a non-zero value. */
 	conn_id->cm_id.ib = NULL;
@@ -1262,6 +1270,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	struct net_device *dev = NULL;
 	struct rdma_cm_event event;
 	int ret;
+	struct ib_device_attr attr;
 
 	listen_id = cm_id->context;
 	if (cma_disable_remove(listen_id, CMA_LISTEN))
@@ -1311,10 +1320,19 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
 	sin = (struct sockaddr_in *) &new_cm_id->route.addr.dst_addr;
 	*sin = iw_event->remote_addr;
 
+	ret = ib_query_device(conn_id->id.device, &attr);
+	if (ret) {
+		cma_enable_remove(conn_id);
+		rdma_destroy_id(new_cm_id);
+		goto out;
+	}
+
 	memset(&event, 0, sizeof event);
 	event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
 	event.param.conn.private_data = iw_event->private_data;
 	event.param.conn.private_data_len = iw_event->private_data_len;
+	event.param.conn.initiator_depth = attr.max_qp_init_rd_atom;
+	event.param.conn.responder_resources = attr.max_qp_rd_atom;
 	ret = conn_id->id.event_handler(&conn_id->id, &event);
 	if (ret) {
 		/* User wants to destroy the CM ID */
@@ -2272,7 +2290,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv,
 	sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr;
 	cm_id->remote_addr = *sin;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
 	if (ret)
 		goto out;
 
@@ -2335,25 +2353,15 @@ static int cma_accept_ib(struct rdma_id_private *id_priv,
 			 struct rdma_conn_param *conn_param)
 {
 	struct ib_cm_rep_param rep;
-	struct ib_qp_attr qp_attr;
-	int qp_attr_mask, ret;
-
-	if (id_priv->id.qp) {
-		ret = cma_modify_qp_rtr(id_priv);
-		if (ret)
-			goto out;
+	int ret;
 
-		qp_attr.qp_state = IB_QPS_RTS;
-		ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr,
-					 &qp_attr_mask);
-		if (ret)
-			goto out;
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		goto out;
 
-		qp_attr.max_rd_atomic = conn_param->initiator_depth;
-		ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
-		if (ret)
-			goto out;
-	}
+	ret = cma_modify_qp_rts(id_priv, conn_param);
+	if (ret)
+		goto out;
 
 	memset(&rep, 0, sizeof rep);
 	rep.qp_num = id_priv->qp_num;
@@ -2378,7 +2386,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv,
 	struct iw_cm_conn_param iw_param;
 	int ret;
 
-	ret = cma_modify_qp_rtr(id_priv);
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
 	if (ret)
 		return ret;
 
@@ -2598,11 +2606,9 @@ static void cma_set_mgid(struct rdma_id_private *id_priv,
 		/* IPv6 address is an SA assigned MGID. */
 		memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
 	} else {
-		ip_ib_mc_map(sin->sin_addr.s_addr, mc_map);
+		ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
 		if (id_priv->id.ps == RDMA_PS_UDP)
 			mc_map[7] = 0x01;	/* Use RDMA CM signature */
-		mc_map[8] = ib_addr_get_pkey(dev_addr) >> 8;
-		mc_map[9] = (unsigned char) ib_addr_get_pkey(dev_addr);
 		*mgid = *(union ib_gid *) (mc_map + 4);
 	}
 }

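With the cma.c change above, the responder_resources and initiator_depth supplied in struct rdma_conn_param are now applied to the QP (max_dest_rd_atomic / max_rd_atomic) when the CMA drives it through RTR and RTS. A minimal librdmacm-style caller is sketched below; "id" is assumed to be an already-resolved rdma_cm_id with a QP attached, and error handling is omitted.

/*
 * Sketch only: supply RDMA-read credits through rdma_conn_param so the
 * modified cma_modify_qp_rtr()/cma_modify_qp_rts() pick them up.
 */
#include <string.h>
#include <rdma/rdma_cma.h>

static int connect_with_rd_atomic(struct rdma_cm_id *id)
{
	struct rdma_conn_param param;

	memset(&param, 0, sizeof param);
	param.responder_resources = 4;	/* incoming RDMA reads we will accept */
	param.initiator_depth     = 4;	/* outstanding RDMA reads we may issue */
	param.retry_count         = 7;

	return rdma_connect(id, &param);
}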
+ 21 - 12
drivers/infiniband/core/fmr_pool.c

@@ -139,7 +139,7 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
 static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 {
 	int                 ret;
-	struct ib_pool_fmr *fmr;
+	struct ib_pool_fmr *fmr, *next;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 
@@ -158,6 +158,20 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
 #endif
 	}
 
+	/*
+	 * The free_list may hold FMRs that have been put there
+	 * because they haven't reached the max_remap count.
+	 * Invalidate their mapping as well.
+	 */
+	list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+		if (fmr->remap_count == 0)
+			continue;
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+		list_move(&fmr->list, &unmap_list);
+	}
+
 	list_splice(&pool->dirty_list, &unmap_list);
 	INIT_LIST_HEAD(&pool->dirty_list);
 	pool->dirty_len = 0;
@@ -182,8 +196,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr)
 	struct ib_fmr_pool *pool = pool_ptr;
 
 	do {
-		if (pool->dirty_len >= pool->dirty_watermark ||
-		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
 			ib_fmr_batch_release(pool);
 
 			atomic_inc(&pool->flush_ser);
@@ -194,8 +207,7 @@ static int ib_fmr_cleanup_thread(void *pool_ptr)
 		}
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pool->dirty_len < pool->dirty_watermark &&
-		    atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+		if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
 		    !kthread_should_stop())
 			schedule();
 		__set_current_state(TASK_RUNNING);
@@ -369,11 +381,6 @@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
 
 	i = 0;
 	list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
-		if (fmr->remap_count) {
-			INIT_LIST_HEAD(&fmr_list);
-			list_add_tail(&fmr->fmr->list, &fmr_list);
-			ib_unmap_fmr(&fmr_list);
-		}
 		ib_dealloc_fmr(fmr->fmr);
 		list_del(&fmr->list);
 		kfree(fmr);
@@ -511,8 +518,10 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
 			list_add_tail(&fmr->list, &pool->free_list);
 		} else {
 			list_add_tail(&fmr->list, &pool->dirty_list);
-			++pool->dirty_len;
-			wake_up_process(pool->thread);
+			if (++pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
 		}
 	}
 

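After this fmr_pool.c change, ib_fmr_pool_unmap() only wakes the cleanup thread once dirty_len reaches the pool's dirty watermark, and the batched release also invalidates partially remapped FMRs sitting on the free list. A kernel-side usage sketch follows, under the assumption that the pool was created elsewhere with ib_create_fmr_pool() and the pages are already DMA mapped.

/*
 * Kernel-side sketch, not part of the patch.  Assumes "pool" was created
 * with ib_create_fmr_pool() and "pages" holds DMA addresses of the region.
 */
#include <linux/err.h>
#include <rdma/ib_fmr_pool.h>

static struct ib_pool_fmr *map_region(struct ib_fmr_pool *pool,
				      u64 *pages, int npages, u64 iova)
{
	/* Reuses a cached mapping when one exists, else remaps an FMR. */
	return ib_fmr_pool_map_phys(pool, pages, npages, iova);
}

static void unmap_region(struct ib_pool_fmr *fmr)
{
	/*
	 * Queues the FMR for a batched unmap; with this patch the flush
	 * thread is only woken once the dirty watermark is reached.
	 */
	ib_fmr_pool_unmap(fmr);
}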
+ 10 - 16
drivers/infiniband/core/mad.c

@@ -701,7 +701,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 	}
 
 	/* Check to post send on QP or process locally */
-	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD)
+	if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+	    smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
 		goto out;
 
 	local = kmalloc(sizeof *local, GFP_ATOMIC);
@@ -752,8 +753,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
 		port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
 					    mad_agent_priv->agent.port_num);
 		if (port_priv) {
-			mad_priv->mad.mad.mad_hdr.tid =
-				((struct ib_mad *)smp)->mad_hdr.tid;
+			memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
 			recv_mad_agent = find_mad_agent(port_priv,
 						        &mad_priv->mad.mad);
 		}
@@ -1100,7 +1100,9 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 		mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
 		/* Timeout will be updated after send completes */
 		mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
-		mad_send_wr->retries = send_buf->retries;
+		mad_send_wr->max_retries = send_buf->retries;
+		mad_send_wr->retries_left = send_buf->retries;
+		send_buf->retries = 0;
 		/* Reference for work request to QP + response */
 		mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
 		mad_send_wr->status = IB_WC_SUCCESS;
@@ -1931,15 +1933,6 @@ local:
 	if (port_priv->device->process_mad) {
 		int ret;
 
-		if (!response) {
-			printk(KERN_ERR PFX "No memory for response MAD\n");
-			/*
-			 * Is it better to assume that
-			 * it wouldn't be processed ?
-			 */
-			goto out;
-		}
-
 		ret = port_priv->device->process_mad(port_priv->device, 0,
 						     port_priv->port_num,
 						     wc, &recv->grh,
@@ -2282,8 +2275,6 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
 
 	/* Empty wait list to prevent receives from finding a request */
 	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
-	/* Empty local completion list as well */
-	list_splice_init(&mad_agent_priv->local_list, &cancel_list);
 	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
 	/* Report all cancelled requests */
@@ -2445,9 +2436,12 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
 {
 	int ret;
 
-	if (!mad_send_wr->retries--)
+	if (!mad_send_wr->retries_left)
 		return -ETIMEDOUT;
 
+	mad_send_wr->retries_left--;
+	mad_send_wr->send_buf.retries++;
+
 	mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
 
 	if (mad_send_wr->mad_agent_priv->agent.rmpp_version) {

+ 2 - 1
drivers/infiniband/core/mad_priv.h

@@ -131,7 +131,8 @@ struct ib_mad_send_wr_private {
 	struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
 	__be64 tid;
 	unsigned long timeout;
-	int retries;
+	int max_retries;
+	int retries_left;
 	int retry;
 	int refcount;
 	enum ib_wc_status status;

+ 1 - 1
drivers/infiniband/core/mad_rmpp.c

@@ -684,7 +684,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent,
 
 	if (seg_num > mad_send_wr->last_ack) {
 		adjust_last_ack(mad_send_wr, seg_num);
-		mad_send_wr->retries = mad_send_wr->send_buf.retries;
+		mad_send_wr->retries_left = mad_send_wr->max_retries;
 	}
 	mad_send_wr->newwin = newwin;
 	if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {

+ 45 - 10
drivers/infiniband/core/multicast.c

@@ -73,11 +73,20 @@ struct mcast_device {
 };
 
 enum mcast_state {
-	MCAST_IDLE,
 	MCAST_JOINING,
 	MCAST_MEMBER,
+	MCAST_ERROR,
+};
+
+enum mcast_group_state {
+	MCAST_IDLE,
 	MCAST_BUSY,
-	MCAST_ERROR
+	MCAST_GROUP_ERROR,
+	MCAST_PKEY_EVENT
+};
+
+enum {
+	MCAST_INVALID_PKEY_INDEX = 0xFFFF
 };
 
 struct mcast_member;
@@ -93,9 +102,10 @@ struct mcast_group {
 	struct mcast_member	*last_join;
 	int			members[3];
 	atomic_t		refcount;
-	enum mcast_state	state;
+	enum mcast_group_state	state;
 	struct ib_sa_query	*query;
 	int			query_id;
+	u16			pkey_index;
 };
 
 struct mcast_member {
@@ -378,9 +388,19 @@ static int fail_join(struct mcast_group *group, struct mcast_member *member,
 static void process_group_error(struct mcast_group *group)
 {
 	struct mcast_member *member;
-	int ret;
+	int ret = 0;
+	u16 pkey_index;
+
+	if (group->state == MCAST_PKEY_EVENT)
+		ret = ib_find_pkey(group->port->dev->device,
+				   group->port->port_num,
+				   be16_to_cpu(group->rec.pkey), &pkey_index);
 
 	spin_lock_irq(&group->lock);
+	if (group->state == MCAST_PKEY_EVENT && !ret &&
+	    group->pkey_index == pkey_index)
+		goto out;
+
 	while (!list_empty(&group->active_list)) {
 		member = list_entry(group->active_list.next,
 				    struct mcast_member, list);
@@ -399,6 +419,7 @@ static void process_group_error(struct mcast_group *group)
 	}
 
 	group->rec.join_state = 0;
+out:
 	group->state = MCAST_BUSY;
 	spin_unlock_irq(&group->lock);
 }
@@ -415,9 +436,9 @@ static void mcast_work_handler(struct work_struct *work)
 retest:
 	spin_lock_irq(&group->lock);
 	while (!list_empty(&group->pending_list) ||
-	       (group->state == MCAST_ERROR)) {
+	       (group->state != MCAST_BUSY)) {
 
-		if (group->state == MCAST_ERROR) {
+		if (group->state != MCAST_BUSY) {
 			spin_unlock_irq(&group->lock);
 			process_group_error(group);
 			goto retest;
@@ -494,12 +515,19 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
 			 void *context)
 {
 	struct mcast_group *group = context;
+	u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
 
 	if (status)
 		process_join_error(group, status);
 	else {
+		ib_find_pkey(group->port->dev->device, group->port->port_num,
+			     be16_to_cpu(rec->pkey), &pkey_index);
+
 		spin_lock_irq(&group->port->lock);
 		group->rec = *rec;
+		if (group->state == MCAST_BUSY &&
+		    group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+			group->pkey_index = pkey_index;
 		if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) {
 			rb_erase(&group->node, &group->port->table);
 			mcast_insert(group->port, group, 1);
@@ -539,6 +567,7 @@ static struct mcast_group *acquire_group(struct mcast_port *port,
 
 	group->port = port;
 	group->rec.mgid = *mgid;
+	group->pkey_index = MCAST_INVALID_PKEY_INDEX;
 	INIT_LIST_HEAD(&group->pending_list);
 	INIT_LIST_HEAD(&group->active_list);
 	INIT_WORK(&group->work, mcast_work_handler);
@@ -707,7 +736,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
 }
 EXPORT_SYMBOL(ib_init_ah_from_mcmember);
 
-static void mcast_groups_lost(struct mcast_port *port)
+static void mcast_groups_event(struct mcast_port *port,
+			       enum mcast_group_state state)
 {
 	struct mcast_group *group;
 	struct rb_node *node;
@@ -721,7 +751,8 @@ static void mcast_groups_lost(struct mcast_port *port)
 			atomic_inc(&group->refcount);
 			queue_work(mcast_wq, &group->work);
 		}
-		group->state = MCAST_ERROR;
+		if (group->state != MCAST_GROUP_ERROR)
+			group->state = state;
 		spin_unlock(&group->lock);
 	}
 	spin_unlock_irqrestore(&port->lock, flags);
@@ -731,16 +762,20 @@ static void mcast_event_handler(struct ib_event_handler *handler,
 				struct ib_event *event)
 {
 	struct mcast_device *dev;
+	int index;
 
 	dev = container_of(handler, struct mcast_device, event_handler);
+	index = event->element.port_num - dev->start_port;
 
 	switch (event->event) {
 	case IB_EVENT_PORT_ERR:
 	case IB_EVENT_LID_CHANGE:
 	case IB_EVENT_SM_CHANGE:
 	case IB_EVENT_CLIENT_REREGISTER:
-		mcast_groups_lost(&dev->port[event->element.port_num -
-					     dev->start_port]);
+		mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
 		break;
 	default:
 		break;

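The multicast.c changes distinguish P_Key table changes from fatal port events: on IB_EVENT_PKEY_CHANGE a group is only torn down if its cached P_Key index no longer resolves to the same slot. A small kernel-style sketch of that check, using the same ib_find_pkey() call as the patch (the helper name below is illustrative, not from the patch):

/*
 * Illustrative helper: decide whether a cached P_Key index is still
 * valid after a PKEY_CHANGE event, mirroring the test added to
 * process_group_error().
 */
#include <rdma/ib_verbs.h>

static bool pkey_index_still_valid(struct ib_device *dev, u8 port_num,
				   __be16 pkey, u16 cached_index)
{
	u16 index;

	if (ib_find_pkey(dev, port_num, be16_to_cpu(pkey), &index))
		return false;		/* P_Key no longer in the table */

	return index == cached_index;	/* unchanged index => keep the group */
}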
+ 17 - 1
drivers/infiniband/core/smi.h

@@ -59,7 +59,8 @@ extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
 					      u8 node_type, int port_num);
 
 /*
- * Return 1 if the SMP should be handled by the local SMA/SM via process_mad
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
  */
 static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
 						  struct ib_device *device)
@@ -71,4 +72,19 @@ static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
 		(smp->hop_ptr == smp->hop_cnt + 1)) ?
 		IB_SMI_HANDLE : IB_SMI_DISCARD);
 }
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+						   struct ib_device *device)
+{
+	/* C14-13:3 -- We're at the end of the DR segment of path */
+	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+	return ((device->process_mad &&
+		ib_get_smp_direction(smp) &&
+		!smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
 #endif	/* __SMI_H_ */

+ 15 - 22
drivers/infiniband/core/ucm.c

@@ -106,6 +106,9 @@ enum {
 	IB_UCM_MAX_DEVICES = 32
 };
 
+/* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */
+extern struct class cm_class;
+
 #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
 
 static void ib_ucm_add_one(struct ib_device *device);
@@ -1199,7 +1202,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static void ib_ucm_release_class_dev(struct class_device *class_dev)
+static void ucm_release_class_dev(struct class_device *class_dev)
 {
 	struct ib_ucm_device *dev;
 
@@ -1217,11 +1220,6 @@ static const struct file_operations ucm_fops = {
 	.poll    = ib_ucm_poll,
 };
 
-static struct class ucm_class = {
-	.name    = "infiniband_cm",
-	.release = ib_ucm_release_class_dev
-};
-
 static ssize_t show_ibdev(struct class_device *class_dev, char *buf)
 {
 	struct ib_ucm_device *dev;
@@ -1257,9 +1255,10 @@ static void ib_ucm_add_one(struct ib_device *device)
 	if (cdev_add(&ucm_dev->dev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
 		goto err;
 
-	ucm_dev->class_dev.class = &ucm_class;
+	ucm_dev->class_dev.class = &cm_class;
 	ucm_dev->class_dev.dev = device->dma_device;
 	ucm_dev->class_dev.devt = ucm_dev->dev.dev;
+	ucm_dev->class_dev.release = ucm_release_class_dev;
 	snprintf(ucm_dev->class_dev.class_id, BUS_ID_SIZE, "ucm%d",
 		 ucm_dev->devnum);
 	if (class_device_register(&ucm_dev->class_dev))
@@ -1306,40 +1305,34 @@ static int __init ib_ucm_init(void)
 				     "infiniband_cm");
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't register device number\n");
-		goto err;
+		goto error1;
 	}
 
-	ret = class_register(&ucm_class);
-	if (ret) {
-		printk(KERN_ERR "ucm: couldn't create class infiniband_cm\n");
-		goto err_chrdev;
-	}
-
-	ret = class_create_file(&ucm_class, &class_attr_abi_version);
+	ret = class_create_file(&cm_class, &class_attr_abi_version);
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
-		goto err_class;
+		goto error2;
 	}
 
 	ret = ib_register_client(&ucm_client);
 	if (ret) {
 		printk(KERN_ERR "ucm: couldn't register client\n");
-		goto err_class;
+		goto error3;
 	}
 	return 0;
 
-err_class:
-	class_unregister(&ucm_class);
-err_chrdev:
+error3:
+	class_remove_file(&cm_class, &class_attr_abi_version);
+error2:
 	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
-err:
+error1:
 	return ret;
 }
 
 static void __exit ib_ucm_cleanup(void)
 {
 	ib_unregister_client(&ucm_client);
-	class_unregister(&ucm_class);
+	class_remove_file(&cm_class, &class_attr_abi_version);
 	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
 	idr_destroy(&ctx_id_table);
 }

+ 92 - 0
drivers/infiniband/core/ucma.c

@@ -31,6 +31,7 @@
  */
 
 #include <linux/completion.h>
+#include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/poll.h>
 #include <linux/idr.h>
@@ -991,6 +992,96 @@ out:
 	return ret;
 }
 
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire mutex's based on pointer comparison to prevent deadlock. */
+	if (file1 < file2) {
+		mutex_lock(&file1->mut);
+		mutex_lock(&file2->mut);
+	} else {
+		mutex_lock(&file2->mut);
+		mutex_lock(&file1->mut);
+	}
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	if (file1 < file2) {
+		mutex_unlock(&file2->mut);
+		mutex_unlock(&file1->mut);
+	} else {
+		mutex_unlock(&file1->mut);
+		mutex_unlock(&file2->mut);
+	}
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *uevent, *tmp;
+
+	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+		if (uevent->ctx == ctx)
+			list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct file *filp;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	filp = fget(cmd.fd);
+	if (!filp)
+		return -ENOENT;
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(filp->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fd's, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fput(filp);
+	return ret;
+}
+
 static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
 				   const char __user *inbuf,
 				   int in_len, int out_len) = {
@@ -1012,6 +1103,7 @@ static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
 	[RDMA_USER_CM_CMD_NOTIFY]	= ucma_notify,
 	[RDMA_USER_CM_CMD_JOIN_MCAST]	= ucma_join_multicast,
 	[RDMA_USER_CM_CMD_LEAVE_MCAST]	= ucma_leave_multicast,
+	[RDMA_USER_CM_CMD_MIGRATE_ID]	= ucma_migrate_id
 };
 
 static ssize_t ucma_write(struct file *filp, const char __user *buf,

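ucma_migrate_id() above moves an rdma_cm context and its pending events from one open file to another while holding both file mutexes; ucma_lock_files() avoids deadlock by always taking the lower-addressed mutex first. A generic userspace illustration of that address-ordered locking idiom (pthreads, purely illustrative, not kernel code):

/*
 * Generic illustration of the address-ordered locking idiom used by
 * ucma_lock_files(): whichever thread needs both locks always takes the
 * lower-addressed one first, so two threads can never wait on each
 * other's second lock.
 */
#include <pthread.h>

struct event_file {
	pthread_mutex_t mut;
	long events;
};

static void lock_pair(struct event_file *a, struct event_file *b)
{
	if (a < b) {
		pthread_mutex_lock(&a->mut);
		pthread_mutex_lock(&b->mut);
	} else {
		pthread_mutex_lock(&b->mut);
		pthread_mutex_lock(&a->mut);
	}
}

static void unlock_pair(struct event_file *a, struct event_file *b)
{
	pthread_mutex_unlock(&a->mut);
	pthread_mutex_unlock(&b->mut);
}

/* Move "n" queued events from one file to the other under both locks. */
static void migrate_events(struct event_file *from, struct event_file *to, long n)
{
	lock_pair(from, to);
	from->events -= n;
	to->events += n;
	unlock_pair(from, to);
}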
+ 53 - 62
drivers/infiniband/core/user_mad.c

@@ -2,6 +2,7 @@
  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
  * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -42,7 +43,7 @@
 #include <linux/cdev.h>
 #include <linux/dma-mapping.h>
 #include <linux/poll.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
 #include <linux/kref.h>
 #include <linux/compat.h>
 
@@ -94,7 +95,7 @@ struct ib_umad_port {
 	struct class_device   *sm_class_dev;
 	struct semaphore       sm_sem;
 
-	struct rw_semaphore    mutex;
+	struct mutex	       file_mutex;
 	struct list_head       file_list;
 
 	struct ib_device      *ib_dev;
@@ -110,11 +111,11 @@ struct ib_umad_device {
 };
 
 struct ib_umad_file {
+	struct mutex		mutex;
 	struct ib_umad_port    *port;
 	struct list_head	recv_list;
 	struct list_head	send_list;
 	struct list_head	port_list;
-	spinlock_t		recv_lock;
 	spinlock_t		send_lock;
 	wait_queue_head_t	recv_wait;
 	struct ib_mad_agent    *agent[IB_UMAD_MAX_AGENTS];
@@ -156,7 +157,7 @@ static int hdr_size(struct ib_umad_file *file)
 		sizeof (struct ib_user_mad_hdr_old);
 }
 
-/* caller must hold port->mutex at least for reading */
+/* caller must hold file->mutex */
 static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
 {
 	return file->agents_dead ? NULL : file->agent[id];
@@ -168,32 +169,30 @@ static int queue_packet(struct ib_umad_file *file,
 {
 	int ret = 1;
 
-	down_read(&file->port->mutex);
+	mutex_lock(&file->mutex);
 
 	for (packet->mad.hdr.id = 0;
 	     packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
 	     packet->mad.hdr.id++)
 		if (agent == __get_agent(file, packet->mad.hdr.id)) {
-			spin_lock_irq(&file->recv_lock);
 			list_add_tail(&packet->list, &file->recv_list);
-			spin_unlock_irq(&file->recv_lock);
 			wake_up_interruptible(&file->recv_wait);
 			ret = 0;
 			break;
 		}
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	return ret;
 }
 
 static void dequeue_send(struct ib_umad_file *file,
 			 struct ib_umad_packet *packet)
- {
+{
 	spin_lock_irq(&file->send_lock);
 	list_del(&packet->list);
 	spin_unlock_irq(&file->send_lock);
- }
+}
 
 static void send_handler(struct ib_mad_agent *agent,
 			 struct ib_mad_send_wc *send_wc)
@@ -341,10 +340,10 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 	if (count < hdr_size(file))
 		return -EINVAL;
 
-	spin_lock_irq(&file->recv_lock);
+	mutex_lock(&file->mutex);
 
 	while (list_empty(&file->recv_list)) {
-		spin_unlock_irq(&file->recv_lock);
+		mutex_unlock(&file->mutex);
 
 		if (filp->f_flags & O_NONBLOCK)
 			return -EAGAIN;
@@ -353,13 +352,13 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 					     !list_empty(&file->recv_list)))
 			return -ERESTARTSYS;
 
-		spin_lock_irq(&file->recv_lock);
+		mutex_lock(&file->mutex);
 	}
 
 	packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
 	list_del(&packet->list);
 
-	spin_unlock_irq(&file->recv_lock);
+	mutex_unlock(&file->mutex);
 
 	if (packet->recv_wc)
 		ret = copy_recv_mad(file, buf, packet, count);
@@ -368,9 +367,9 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf,
 
 	if (ret < 0) {
 		/* Requeue packet */
-		spin_lock_irq(&file->recv_lock);
+		mutex_lock(&file->mutex);
 		list_add(&packet->list, &file->recv_list);
-		spin_unlock_irq(&file->recv_lock);
+		mutex_unlock(&file->mutex);
 	} else {
 		if (packet->recv_wc)
 			ib_free_recv_mad(packet->recv_wc);
@@ -481,7 +480,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 		goto err;
 	}
 
-	down_read(&file->port->mutex);
+	mutex_lock(&file->mutex);
 
 	agent = __get_agent(file, packet->mad.hdr.id);
 	if (!agent) {
@@ -577,7 +576,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 	if (ret)
 		goto err_send;
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 	return count;
 
 err_send:
@@ -587,7 +586,7 @@ err_msg:
 err_ah:
 	ib_destroy_ah(ah);
 err_up:
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 err:
 	kfree(packet);
 	return ret;
@@ -613,11 +612,12 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
 {
 	struct ib_user_mad_reg_req ureq;
 	struct ib_mad_reg_req req;
-	struct ib_mad_agent *agent;
+	struct ib_mad_agent *agent = NULL;
 	int agent_id;
 	int ret;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	if (!file->port->ib_dev) {
 		ret = -EPIPE;
@@ -666,13 +666,13 @@ found:
 				      send_handler, recv_handler, file);
 	if (IS_ERR(agent)) {
 		ret = PTR_ERR(agent);
+		agent = NULL;
 		goto out;
 	}
 
 	if (put_user(agent_id,
 		     (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
 		ret = -EFAULT;
-		ib_unregister_mad_agent(agent);
 		goto out;
 	}
 
@@ -690,7 +690,13 @@ found:
 	ret = 0;
 
 out:
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
+
+	if (ret && agent)
+		ib_unregister_mad_agent(agent);
+
+	mutex_unlock(&file->port->file_mutex);
+
 	return ret;
 }
 
@@ -703,7 +709,8 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
 	if (get_user(id, arg))
 		return -EFAULT;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
 		ret = -EINVAL;
@@ -714,11 +721,13 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
 	file->agent[id] = NULL;
 
 out:
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	if (agent)
 		ib_unregister_mad_agent(agent);
 
+	mutex_unlock(&file->port->file_mutex);
+
 	return ret;
 }
 
@@ -726,12 +735,12 @@ static long ib_umad_enable_pkey(struct ib_umad_file *file)
 {
 	int ret = 0;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->mutex);
 	if (file->already_used)
 		ret = -EINVAL;
 	else
 		file->use_pkey_index = 1;
-	up_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	return ret;
 }
@@ -783,7 +792,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 	if (!port)
 		return -ENXIO;
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 
 	if (!port->ib_dev) {
 		ret = -ENXIO;
@@ -797,7 +806,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
-	spin_lock_init(&file->recv_lock);
+	mutex_init(&file->mutex);
 	spin_lock_init(&file->send_lock);
 	INIT_LIST_HEAD(&file->recv_list);
 	INIT_LIST_HEAD(&file->send_list);
@@ -809,7 +818,7 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
 	list_add_tail(&file->port_list, &port->file_list);
 
 out:
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 	return ret;
 }
 
@@ -821,7 +830,8 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
 	int already_dead;
 	int i;
 
-	down_write(&file->port->mutex);
+	mutex_lock(&file->port->file_mutex);
+	mutex_lock(&file->mutex);
 
 	already_dead = file->agents_dead;
 	file->agents_dead = 1;
@@ -834,14 +844,14 @@ static int ib_umad_close(struct inode *inode, struct file *filp)
 
 	list_del(&file->port_list);
 
-	downgrade_write(&file->port->mutex);
+	mutex_unlock(&file->mutex);
 
 	if (!already_dead)
 		for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
 			if (file->agent[i])
 				ib_unregister_mad_agent(file->agent[i]);
 
-	up_read(&file->port->mutex);
+	mutex_unlock(&file->port->file_mutex);
 
 	kfree(file);
 	kref_put(&dev->ref, ib_umad_release_dev);
@@ -914,10 +924,10 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp)
 	};
 	int ret = 0;
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 	if (port->ib_dev)
 		ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 
 	up(&port->sm_sem);
 
@@ -981,7 +991,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	port->ib_dev   = device;
 	port->port_num = port_num;
 	init_MUTEX(&port->sm_sem);
-	init_rwsem(&port->mutex);
+	mutex_init(&port->file_mutex);
 	INIT_LIST_HEAD(&port->file_list);
 
 	port->dev = cdev_alloc();
@@ -1052,6 +1062,7 @@ err_cdev:
 static void ib_umad_kill_port(struct ib_umad_port *port)
 {
 	struct ib_umad_file *file;
+	int already_dead;
 	int id;
 
 	class_set_devdata(port->class_dev,    NULL);
@@ -1067,42 +1078,22 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 	umad_port[port->dev_num] = NULL;
 	spin_unlock(&port_lock);
 
-	down_write(&port->mutex);
+	mutex_lock(&port->file_mutex);
 
 	port->ib_dev = NULL;
 
-	/*
-	 * Now go through the list of files attached to this port and
-	 * unregister all of their MAD agents.  We need to hold
-	 * port->mutex while doing this to avoid racing with
-	 * ib_umad_close(), but we can't hold the mutex for writing
-	 * while calling ib_unregister_mad_agent(), since that might
-	 * deadlock by calling back into queue_packet().  So we
-	 * downgrade our lock to a read lock, and then drop and
-	 * reacquire the write lock for the next iteration.
-	 *
-	 * We do list_del_init() on the file's list_head so that the
-	 * list_del in ib_umad_close() is still OK, even after the
-	 * file is removed from the list.
-	 */
-	while (!list_empty(&port->file_list)) {
-		file = list_entry(port->file_list.next, struct ib_umad_file,
-				  port_list);
-
+	list_for_each_entry(file, &port->file_list, port_list) {
+		mutex_lock(&file->mutex);
+		already_dead = file->agents_dead;
 		file->agents_dead = 1;
-		list_del_init(&file->port_list);
-
-		downgrade_write(&port->mutex);
+		mutex_unlock(&file->mutex);
 
 		for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
 			if (file->agent[id])
 				ib_unregister_mad_agent(file->agent[id]);
-
-		up_read(&port->mutex);
-		down_write(&port->mutex);
 	}
 
-	up_write(&port->mutex);
+	mutex_unlock(&port->file_mutex);
 
 	clear_bit(port->dev_num, dev_map);
 }

+ 2 - 2
drivers/infiniband/hw/cxgb3/cxio_hal.c

@@ -179,7 +179,7 @@ int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
 	setup.size = 1UL << cq->size_log2;
 	setup.credits = 65535;
 	setup.credit_thres = 1;
-	if (rdev_p->t3cdev_p->type == T3B)
+	if (rdev_p->t3cdev_p->type != T3A)
 		setup.ovfl_mode = 0;
 	else
 		setup.ovfl_mode = 1;
@@ -584,7 +584,7 @@ static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
 {
 	u32 i, nr_wqe, copy_len;
 	u8 *copy_data;
-	u8 wr_len, utx_len;	/* lenght in 8 byte flit */
+	u8 wr_len, utx_len;	/* length in 8 byte flit */
 	enum t3_wr_flags flag;
 	__be64 *wqe;
 	u64 utx_cmd;

+ 3 - 2
drivers/infiniband/hw/cxgb3/cxio_wr.h

@@ -315,7 +315,7 @@ struct t3_rdma_init_wr {
 	__be32 ird;
 	__be64 qp_dma_addr;	/* 7 */
 	__be32 qp_dma_size;	/* 8 */
-	u32 irs;
+	__be32 irs;
 };
 
 struct t3_genbit {
@@ -324,7 +324,8 @@ struct t3_genbit {
 };
 
 enum rdma_init_wr_flags {
-	RECVS_POSTED = 1,
+	RECVS_POSTED = (1<<0),
+	PRIV_QP = (1<<1),
 };
 
 union t3_wr {

+ 2 - 2
drivers/infiniband/hw/cxgb3/iwch_cm.c

@@ -1118,7 +1118,7 @@ static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 	     status2errno(rpl->status));
 	connect_reply_upcall(ep, status2errno(rpl->status));
 	state_set(&ep->com, DEAD);
-	if (ep->com.tdev->type == T3B && act_open_has_tid(rpl->status))
+	if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status))
 		release_tid(ep->com.tdev, GET_TID(rpl), NULL);
 	cxgb3_free_atid(ep->com.tdev, ep->atid);
 	dst_release(ep->dst);
@@ -1249,7 +1249,7 @@ static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip,
 	skb_trim(skb, sizeof(struct cpl_tid_release));
 	skb_get(skb);
 
-	if (tdev->type == T3B)
+	if (tdev->type != T3A)
 		release_tid(tdev, hwtid, skb);
 	else {
 		struct cpl_pass_accept_rpl *rpl;

+ 7 - 0
drivers/infiniband/hw/cxgb3/iwch_mem.c

@@ -122,6 +122,13 @@ int build_phys_page_list(struct ib_phys_buf *buffer_list,
 		*total_size += buffer_list[i].size;
 		if (i > 0)
 			mask |= buffer_list[i].addr;
+		else
+			mask |= buffer_list[i].addr & PAGE_MASK;
+		if (i != num_phys_buf - 1)
+			mask |= buffer_list[i].addr + buffer_list[i].size;
+		else
+			mask |= (buffer_list[i].addr + buffer_list[i].size +
+				PAGE_SIZE - 1) & PAGE_MASK;
 	}
 
 	if (*total_size > 0xFFFFFFFFULL)

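The build_phys_page_list() fix above folds every buffer boundary into "mask" (the first start address aligned down to a page, the final end address rounded up), so the largest page size covering all buffers can be derived from the lowest set bit of the mask. A standalone sketch of that shift calculation with made-up buffer addresses follows; it prints 15, i.e. 32 KB pages.

/*
 * Standalone illustration (assumed to mirror the driver's shift
 * selection): OR together every boundary that must be page aligned,
 * then the lowest set bit of the mask bounds the usable page shift.
 */
#include <stdint.h>
#include <stdio.h>

static int largest_page_shift(uint64_t mask, int min_shift, int max_shift)
{
	int shift;

	for (shift = min_shift; shift < max_shift; ++shift)
		if (mask & (1ULL << shift))
			break;
	return shift;
}

int main(void)
{
	/* Two hypothetical buffers: 0x10000..0x18000 and 0x18000..0x20000. */
	uint64_t mask = (0x10000ULL & ~0xfffULL)		/* first start, aligned down */
		      | 0x18000ULL | 0x18000ULL			/* interior boundaries */
		      | ((0x20000ULL + 0xfff) & ~0xfffULL);	/* last end, rounded up */

	printf("largest page shift = %d\n", largest_page_shift(mask, 12, 27));
	return 0;
}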
+ 6 - 1
drivers/infiniband/hw/cxgb3/iwch_provider.c

@@ -39,6 +39,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -645,7 +646,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (err)
 		goto err;
 
-	if (udata && t3b_device(rhp)) {
+	if (udata && !t3a_device(rhp)) {
 		uresp.pbl_addr = (mhp->attr.pbl_addr -
 	                         rhp->rdev.rnic_info.pbl_base) >> 3;
 		PDBG("%s user resp pbl_addr 0x%x\n", __FUNCTION__,
@@ -1053,7 +1054,9 @@ static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
 	struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
 
 	PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev);
+	rtnl_lock();
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	rtnl_unlock();
 	return sprintf(buf, "%s\n", info.fw_version);
 }
 
@@ -1065,7 +1068,9 @@ static ssize_t show_hca(struct class_device *cdev, char *buf)
 	struct net_device *lldev = dev->rdev.t3cdev_p->lldev;
 
 	PDBG("%s class dev 0x%p\n", __FUNCTION__, cdev);
+	rtnl_lock();
 	lldev->ethtool_ops->get_drvinfo(lldev, &info);
+	rtnl_unlock();
 	return sprintf(buf, "%s\n", info.driver);
 }
 

+ 8 - 21
drivers/infiniband/hw/cxgb3/iwch_qp.c

@@ -208,36 +208,19 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
 static int iwch_build_rdma_recv(struct iwch_dev *rhp, union t3_wr *wqe,
 				struct ib_recv_wr *wr)
 {
-	int i, err = 0;
-	u32 pbl_addr[4];
-	u8 page_size[4];
+	int i;
 	if (wr->num_sge > T3_MAX_SGE)
 		return -EINVAL;
-	err = iwch_sgl2pbl_map(rhp, wr->sg_list, wr->num_sge, pbl_addr,
-			       page_size);
-	if (err)
-		return err;
-	wqe->recv.pagesz[0] = page_size[0];
-	wqe->recv.pagesz[1] = page_size[1];
-	wqe->recv.pagesz[2] = page_size[2];
-	wqe->recv.pagesz[3] = page_size[3];
 	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
 	for (i = 0; i < wr->num_sge; i++) {
 		wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
 		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
-
-		/* to in the WQE == the offset into the page */
-		wqe->recv.sgl[i].to = cpu_to_be64(((u32) wr->sg_list[i].addr) %
-				(1UL << (12 + page_size[i])));
-
-		/* pbl_addr is the adapters address in the PBL */
-		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+		wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
 	}
 	for (; i < T3_MAX_SGE; i++) {
 		wqe->recv.sgl[i].stag = 0;
 		wqe->recv.sgl[i].len = 0;
 		wqe->recv.sgl[i].to = 0;
-		wqe->recv.pbl_addr[i] = 0;
 	}
 	return 0;
 }
@@ -659,6 +642,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
 	cxio_flush_rq(&qhp->wq, &rchp->cq, count);
 	spin_unlock(&qhp->lock);
 	spin_unlock_irqrestore(&rchp->lock, *flag);
+	(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
 
 	/* locking heirarchy: cq lock first, then qp lock. */
 	spin_lock_irqsave(&schp->lock, *flag);
@@ -668,6 +652,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
 	cxio_flush_sq(&qhp->wq, &schp->cq, count);
 	spin_unlock(&qhp->lock);
 	spin_unlock_irqrestore(&schp->lock, *flag);
+	(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
 
 	/* deref */
 	if (atomic_dec_and_test(&qhp->refcnt))
@@ -678,7 +663,7 @@ static void __flush_qp(struct iwch_qp *qhp, unsigned long *flag)
 
 static void flush_qp(struct iwch_qp *qhp, unsigned long *flag)
 {
-	if (t3b_device(qhp->rhp))
+	if (qhp->ibqp.uobject)
 		cxio_set_wq_in_error(&qhp->wq);
 	else
 		__flush_qp(qhp, flag);
@@ -732,6 +717,7 @@ static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
 	init_attr.qp_dma_addr = qhp->wq.dma_addr;
 	init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
 	init_attr.flags = rqes_posted(qhp) ? RECVS_POSTED : 0;
+	init_attr.flags |= capable(CAP_NET_BIND_SERVICE) ? PRIV_QP : 0;
 	init_attr.irs = qhp->ep->rcv_seq;
 	PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d "
 	     "flags 0x%x qpcaps 0x%x\n", __FUNCTION__,
@@ -847,10 +833,11 @@ int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
 				disconnect = 1;
 				ep = qhp->ep;
 			}
+			flush_qp(qhp, &flag);
 			break;
 		case IWCH_QP_STATE_TERMINATE:
 			qhp->attr.state = IWCH_QP_STATE_TERMINATE;
-			if (t3b_device(qhp->rhp))
+			if (qhp->ibqp.uobject)
 				cxio_set_wq_in_error(&qhp->wq);
 			if (!internal)
 				terminate = 1;

+ 1 - 1
drivers/infiniband/hw/ehca/ehca_av.c

@@ -1,7 +1,7 @@
 /*
  *  IBM eServer eHCA Infiniband device driver for Linux on POWER
  *
- *  adress vector functions
+ *  address vector functions
  *
  *  Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
  *           Khadija Souissi <souissik@de.ibm.com>

+ 22 - 1
drivers/infiniband/hw/ehca/ehca_classes.h

@@ -94,7 +94,11 @@ struct ehca_sma_attr {
 
 struct ehca_sport {
 	struct ib_cq *ibcq_aqp1;
-	struct ib_qp *ibqp_aqp1;
+	struct ib_qp *ibqp_sqp[2];
+	/* lock to serialze modify_qp() calls for sqp in normal
+	/* lock to serialize modify_qp() calls for sqp in normal
+	 * and irq path (when event PORT_ACTIVE is received first time)
+	 */
+	spinlock_t mod_sqp_lock;
 	enum ib_port_state port_state;
 	struct ehca_sma_attr saved_attr;
 };
@@ -141,6 +145,14 @@ enum ehca_ext_qp_type {
 	EQPT_SRQ       = 3,
 };
 
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+	int mask;
+	struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
 struct ehca_qp {
 	union {
 		struct ib_qp ib_qp;
@@ -164,10 +176,18 @@ struct ehca_qp {
 	struct ehca_cq *recv_cq;
 	unsigned int sqerr_purgeflag;
 	struct hlist_node list_entries;
+	/* array to cache modify_qp()'s parms for GSI/SMI qp */
+	struct ehca_mod_qp_parm *mod_qp_parm;
+	int mod_qp_parm_idx;
 	/* mmap counter for resources mapped into user space */
 	u32 mm_count_squeue;
 	u32 mm_count_rqueue;
 	u32 mm_count_galpa;
+	/* unsolicited ack circumvention */
+	int unsol_ack_circ;
+	int mtu_shift;
+	u32 message_count;
+	u32 packet_count;
 };
 
 #define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
@@ -323,6 +343,7 @@ extern int ehca_port_act_time;
 extern int ehca_use_hp_mr;
 extern int ehca_scaling_code;
 extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
 
 struct ipzu_queue_resp {
 	u32 qe_size;      /* queue entry size */

+ 1 - 1
drivers/infiniband/hw/ehca/ehca_cq.c

@@ -246,7 +246,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 		} else {
 			if (h_ret != H_PAGE_REGISTERED) {
 				ehca_err(device, "Registration of page failed "
-					 "ehca_cq=%p cq_num=%x h_ret=%li"
+					 "ehca_cq=%p cq_num=%x h_ret=%li "
 					 "counter=%i act_pages=%i",
 					 my_cq, my_cq->cq_number,
 					 h_ret, counter, param.act_pages);

+ 33 - 5
drivers/infiniband/hw/ehca/ehca_irq.c

@@ -62,6 +62,7 @@
 #define NEQE_PORT_NUMBER       EHCA_BMASK_IBM( 8, 15)
 #define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
 #define NEQE_DISRUPTIVE        EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT    EHCA_BMASK_IBM(16, 23)
 
 #define ERROR_DATA_LENGTH      EHCA_BMASK_IBM(52, 63)
 #define ERROR_DATA_TYPE        EHCA_BMASK_IBM( 0,  7)
@@ -354,17 +355,34 @@ static void parse_ec(struct ehca_shca *shca, u64 eqe)
 {
 	u8 ec   = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
 	u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+	u8 spec_event;
+	struct ehca_sport *sport = &shca->sport[port - 1];
+	unsigned long flags;
 
 	switch (ec) {
 	case 0x30: /* port availability change */
 		if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
-			shca->sport[port - 1].port_state = IB_PORT_ACTIVE;
+			int suppress_event;
+			/* replay modify_qp for sqps */
+			spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+			suppress_event = !sport->ibqp_sqp[IB_QPT_GSI];
+			if (sport->ibqp_sqp[IB_QPT_SMI])
+				ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+			if (!suppress_event)
+				ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+			spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+			/* AQP1 was destroyed, ignore this event */
+			if (suppress_event)
+				break;
+
+			sport->port_state = IB_PORT_ACTIVE;
 			dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
 					    "is active");
 			ehca_query_sma_attr(shca, port,
-					    &shca->sport[port - 1].saved_attr);
+					    &sport->saved_attr);
 		} else {
-			shca->sport[port - 1].port_state = IB_PORT_DOWN;
+			sport->port_state = IB_PORT_DOWN;
 			dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
 					    "is inactive");
 		}
@@ -378,11 +396,11 @@ static void parse_ec(struct ehca_shca *shca, u64 eqe)
 			ehca_warn(&shca->ib_device, "disruptive port "
 				  "%d configuration change", port);
 
-			shca->sport[port - 1].port_state = IB_PORT_DOWN;
+			sport->port_state = IB_PORT_DOWN;
 			dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
 					    "is inactive");
 
-			shca->sport[port - 1].port_state = IB_PORT_ACTIVE;
+			sport->port_state = IB_PORT_ACTIVE;
 			dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
 					    "is active");
 		} else
@@ -394,6 +412,16 @@ static void parse_ec(struct ehca_shca *shca, u64 eqe)
 	case 0x33:  /* trace stopped */
 		ehca_err(&shca->ib_device, "Traced stopped.");
 		break;
+	case 0x34: /* util async event */
+		spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+		if (spec_event == 0x80) /* client reregister required */
+			dispatch_port_event(shca, port,
+					    IB_EVENT_CLIENT_REREGISTER,
+					    "client reregister req.");
+		else
+			ehca_warn(&shca->ib_device, "Unknown util async "
+				  "event %x on port %x", spec_event, port);
+		break;
 	default:
 		ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
 			 ec, shca->ib_device.name);

+ 2 - 0
drivers/infiniband/hw/ehca/ehca_iverbs.h

@@ -200,4 +200,6 @@ void ehca_free_fw_ctrlblock(void *ptr);
 #define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
 #endif
 
+void ehca_recover_sqp(struct ib_qp *sqp);
+
 #endif

+ 9 - 6
drivers/infiniband/hw/ehca/ehca_main.c

@@ -90,7 +90,8 @@ MODULE_PARM_DESC(hw_level,
 		 "hardware level"
 		 " (0: autosensing (default), 1: v. 0.20, 2: v. 0.21)");
 MODULE_PARM_DESC(nr_ports,
-		 "number of connected ports (default: 2)");
+		 "number of connected ports (-1: autodetect, 1: port one only, "
+		 "2: two ports (default)");
 MODULE_PARM_DESC(use_hp_mr,
 		 "high performance MRs (0: no (default), 1: yes)");
 MODULE_PARM_DESC(port_act_time,
@@ -511,7 +512,7 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
 	}
 	sport->ibcq_aqp1 = ibcq;
 
-	if (sport->ibqp_aqp1) {
+	if (sport->ibqp_sqp[IB_QPT_GSI]) {
 		ehca_err(&shca->ib_device, "AQP1 QP is already created.");
 		ret = -EPERM;
 		goto create_aqp1;
@@ -537,7 +538,7 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
 		ret = PTR_ERR(ibqp);
 		goto create_aqp1;
 	}
-	sport->ibqp_aqp1 = ibqp;
+	sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
 
 	return 0;
 
@@ -550,7 +551,7 @@ static int ehca_destroy_aqp1(struct ehca_sport *sport)
 {
 	int ret;
 
-	ret = ib_destroy_qp(sport->ibqp_aqp1);
+	ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
 	if (ret) {
 		ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
 		return ret;
@@ -693,7 +694,7 @@ static int __devinit ehca_probe(struct of_device *dev,
 	struct ehca_shca *shca;
 	const u64 *handle;
 	struct ib_pd *ibpd;
-	int ret;
+	int ret, i;
 
 	handle = of_get_property(dev->node, "ibm,hca-handle", NULL);
 	if (!handle) {
@@ -714,6 +715,8 @@ static int __devinit ehca_probe(struct of_device *dev,
 		return -ENOMEM;
 	}
 	mutex_init(&shca->modify_mutex);
+	for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+		spin_lock_init(&shca->sport[i].mod_sqp_lock);
 
 	shca->ofdev = dev;
 	shca->ipz_hca_handle.handle = *handle;
@@ -934,7 +937,7 @@ void ehca_poll_eqs(unsigned long data)
 				ehca_process_eq(shca, 0);
 		}
 	}
-	mod_timer(&poll_eqs_timer, jiffies + HZ);
+	mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
 	spin_unlock(&shca_list_lock);
 }
 

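The mod_timer() change above re-arms the one-second EQ poll on a round_jiffies() deadline, so periodic timers across the system tend to expire in the same tick and idle CPUs wake up less often. A toy user-space illustration of the rounding idea only; the kernel's round_jiffies() uses a different rounding cutoff plus a per-CPU skew, neither of which is modeled here:

#include <stdio.h>

#define HZ 250	/* assumed tick rate for the sketch */

/* Round a jiffies deadline to the nearest whole second. */
static unsigned long round_to_second(unsigned long j)
{
	return ((j + HZ / 2) / HZ) * HZ;
}

int main(void)
{
	unsigned long jiffies = 12400;	/* made-up current time */

	printf("raw deadline:     %lu\n", jiffies + HZ);
	printf("rounded deadline: %lu\n", round_to_second(jiffies + HZ));
	return 0;
}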
+ 169 - 11
drivers/infiniband/hw/ehca/ehca_qp.c

@@ -592,10 +592,8 @@ static struct ehca_qp *internal_create_qp(
 		goto create_qp_exit1;
 	}
 
-	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		parms.sigtype = HCALL_SIGT_EVERY;
-	else
-		parms.sigtype = HCALL_SIGT_BY_WQE;
+	/* Always signal by WQE so we can hide circ. WQEs */
+	parms.sigtype = HCALL_SIGT_BY_WQE;
 
 	/* UD_AV CIRCUMVENTION */
 	max_send_sge = init_attr->cap.max_send_sge;
@@ -618,6 +616,10 @@ static struct ehca_qp *internal_create_qp(
 	parms.squeue.max_sge = max_send_sge;
 	parms.rqueue.max_sge = max_recv_sge;
 
+	/* RC QPs need one more SWQE for unsolicited ack circumvention */
+	if (qp_type == IB_QPT_RC)
+		parms.squeue.max_wr++;
+
 	if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
 		if (HAS_SQ(my_qp))
 			ehca_determine_small_queue(
@@ -650,6 +652,8 @@ static struct ehca_qp *internal_create_qp(
 			parms.squeue.act_nr_sges = 1;
 			parms.rqueue.act_nr_sges = 1;
 		}
+		/* hide the extra WQE */
+		parms.squeue.act_nr_wqes--;
 		break;
 	case IB_QPT_UD:
 	case IB_QPT_GSI:
@@ -729,12 +733,31 @@ static struct ehca_qp *internal_create_qp(
 	init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
 	my_qp->init_attr = *init_attr;
 
+	if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+		shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+			&my_qp->ib_qp;
+		if (ehca_nr_ports < 0) {
+			/* alloc array to cache subsequent modify qp parms
+			 * for autodetect mode
+			 */
+			my_qp->mod_qp_parm =
+				kzalloc(EHCA_MOD_QP_PARM_MAX *
+					sizeof(*my_qp->mod_qp_parm),
+					GFP_KERNEL);
+			if (!my_qp->mod_qp_parm) {
+				ehca_err(pd->device,
+					 "Could not alloc mod_qp_parm");
+				goto create_qp_exit4;
+			}
+		}
+	}
+
 	/* NOTE: define_apq0() not supported yet */
 	if (qp_type == IB_QPT_GSI) {
 		h_ret = ehca_define_sqp(shca, my_qp, init_attr);
 		if (h_ret != H_SUCCESS) {
 			ret = ehca2ib_return_code(h_ret);
-			goto create_qp_exit4;
+			goto create_qp_exit5;
 		}
 	}
 
@@ -743,7 +766,7 @@ static struct ehca_qp *internal_create_qp(
 		if (ret) {
 			ehca_err(pd->device,
 				 "Couldn't assign qp to send_cq ret=%i", ret);
-			goto create_qp_exit4;
+			goto create_qp_exit5;
 		}
 	}
 
@@ -769,12 +792,18 @@ static struct ehca_qp *internal_create_qp(
 		if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
 			ehca_err(pd->device, "Copy to udata failed");
 			ret = -EINVAL;
-			goto create_qp_exit4;
+			goto create_qp_exit6;
 		}
 	}
 
 	return my_qp;
 
+create_qp_exit6:
+	ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit5:
+	kfree(my_qp->mod_qp_parm);
+
 create_qp_exit4:
 	if (HAS_RQ(my_qp))
 		ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
@@ -858,7 +887,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				update_mask,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
-		ehca_err(pd->device, "Could not modify SRQ to INIT"
+		ehca_err(pd->device, "Could not modify SRQ to INIT "
 			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
@@ -872,7 +901,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				update_mask,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
-		ehca_err(pd->device, "Could not enable SRQ"
+		ehca_err(pd->device, "Could not enable SRQ "
 			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
@@ -886,7 +915,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				update_mask,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
-		ehca_err(pd->device, "Could not modify SRQ to RTR"
+		ehca_err(pd->device, "Could not modify SRQ to RTR "
 			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
@@ -992,7 +1021,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 	unsigned long flags = 0;
 
 	/* do query_qp to obtain current attr values */
-	mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+	mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
 	if (!mqpcb) {
 		ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
 			 "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
@@ -1180,6 +1209,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
 	}
 	if (attr_mask & IB_QP_PORT) {
+		struct ehca_sport *sport;
+		struct ehca_qp *aqp1;
 		if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
 			ret = -EINVAL;
 			ehca_err(ibqp->device, "Invalid port=%x. "
@@ -1188,6 +1219,29 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 				 shca->num_ports);
 			goto modify_qp_exit2;
 		}
+		sport = &shca->sport[attr->port_num - 1];
+		if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+			/* should not occur */
+			ret = -EFAULT;
+			ehca_err(ibqp->device, "AQP1 was not created for "
+				 "port=%x", attr->port_num);
+			goto modify_qp_exit2;
+		}
+		aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+				    struct ehca_qp, ib_qp);
+		if (ibqp->qp_type != IB_QPT_GSI &&
+		    ibqp->qp_type != IB_QPT_SMI &&
+		    aqp1->mod_qp_parm) {
+			/*
+			 * firmware will reject this modify_qp() because
+			 * port is not activated/initialized fully
+			 */
+			ret = -EFAULT;
+			ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+				  "either port is being activated (try again) "
+				  "or cabling issue", attr->port_num);
+			goto modify_qp_exit2;
+		}
 		mqpcb->prim_phys_port = attr->port_num;
 		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
 	}
@@ -1244,6 +1298,8 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_PATH_MTU) {
+		/* store ld(MTU) */
+		my_qp->mtu_shift = attr->path_mtu + 7;
 		mqpcb->path_mtu = attr->path_mtu;
 		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
 	}
@@ -1467,6 +1523,8 @@ modify_qp_exit1:
 int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		   struct ib_udata *udata)
 {
+	struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+					      ib_device);
 	struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
 	struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
 					     ib_pd);
@@ -1479,9 +1537,100 @@ int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
 		return -EINVAL;
 	}
 
+	/* The if-block below caches qp_attr to be modified for GSI and SMI
+	 * qps during the initialization by ib_mad. When the respective port
+	 * is activated, ie we got an event PORT_ACTIVE, we'll replay the
+	 * cached modify calls sequence, see ehca_recover_sqp() below.
+	 * Why that is required:
+	 * 1) If one port is connected, older code requires that port one
+	 *    to be connected and module option nr_ports=1 to be given by
+	 *    user, which is very inconvenient for end user.
+	 * 2) Firmware accepts modify_qp() only if respective port has become
+	 *    active. Older code had a wait loop of 30sec create_qp()/
+	 *    define_aqp1(), which is not appropriate in practice. This
+	 *    code now removes that wait loop, see define_aqp1(), and always
+	 *    reports all ports to ib_mad resp. users. Only activated ports
+	 *    will then be usable for the users.
+	 */
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+		int port = my_qp->init_attr.port_num;
+		struct ehca_sport *sport = &shca->sport[port - 1];
+		unsigned long flags;
+		spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+		/* cache qp_attr only during init */
+		if (my_qp->mod_qp_parm) {
+			struct ehca_mod_qp_parm *p;
+			if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+				ehca_err(&shca->ib_device,
+					 "mod_qp_parm overflow state=%x port=%x"
+					 " type=%x", attr->qp_state,
+					 my_qp->init_attr.port_num,
+					 ibqp->qp_type);
+				spin_unlock_irqrestore(&sport->mod_sqp_lock,
+						       flags);
+				return -EINVAL;
+			}
+			p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+			p->mask = attr_mask;
+			p->attr = *attr;
+			my_qp->mod_qp_parm_idx++;
+			ehca_dbg(&shca->ib_device,
+				 "Saved qp_attr for state=%x port=%x type=%x",
+				 attr->qp_state, my_qp->init_attr.port_num,
+				 ibqp->qp_type);
+			spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+			return 0;
+		}
+		spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+	}
+
 	return internal_modify_qp(ibqp, attr, attr_mask, 0);
 }
 
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+	struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+	int port = my_sqp->init_attr.port_num;
+	struct ib_qp_attr attr;
+	struct ehca_mod_qp_parm *qp_parm;
+	int i, qp_parm_idx, ret;
+	unsigned long flags, wr_cnt;
+
+	if (!my_sqp->mod_qp_parm)
+		return;
+	ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+	qp_parm = my_sqp->mod_qp_parm;
+	qp_parm_idx = my_sqp->mod_qp_parm_idx;
+	for (i = 0; i < qp_parm_idx; i++) {
+		attr = qp_parm[i].attr;
+		ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+		if (ret) {
+			ehca_err(sqp->device, "Could not modify SQP port=%x "
+				 "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+			goto free_qp_parm;
+		}
+		ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+			 port, sqp->qp_num, attr.qp_state);
+	}
+
+	/* re-trigger posted recv wrs */
+	wr_cnt =  my_sqp->ipz_rqueue.current_q_offset /
+		my_sqp->ipz_rqueue.qe_size;
+	if (wr_cnt) {
+		spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+		hipz_update_rqa(my_sqp, wr_cnt);
+		spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+		ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+			 port, sqp->qp_num, wr_cnt);
+	}
+
+free_qp_parm:
+	kfree(qp_parm);
+	/* this prevents subsequent calls to modify_qp() from caching qp_attr */
+	my_sqp->mod_qp_parm = NULL;
+}
+
 int ehca_query_qp(struct ib_qp *qp,
 		  struct ib_qp_attr *qp_attr,
 		  int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
@@ -1769,6 +1918,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
 	struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
 	struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
 					     ib_pd);
+	struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
 	u32 cur_pid = current->tgid;
 	u32 qp_num = my_qp->real_qp_num;
 	int ret;
@@ -1815,6 +1965,14 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
 	port_num = my_qp->init_attr.port_num;
 	qp_type  = my_qp->init_attr.qp_type;
 
+	if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+		spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+		kfree(my_qp->mod_qp_parm);
+		my_qp->mod_qp_parm = NULL;
+		shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+		spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+	}
+
 	/* no support for IB_QPT_SMI yet */
 	if (qp_type == IB_QPT_GSI) {
 		struct ib_event event;

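The ehca_qp.c changes above, together with the ehca_irq.c hook earlier in this diff, implement the autodetect flow: while a port is still coming up, modify_qp() calls against the GSI/SMI QPs are recorded in the mod_qp_parm array (up to EHCA_MOD_QP_PARM_MAX entries) and replayed by ehca_recover_sqp() once the PORT_ACTIVE event arrives. A compact user-space model of that record-and-replay scheme, with hypothetical names and no locking (the driver serializes all of this with mod_sqp_lock):

#include <stdio.h>

#define MOD_QP_PARM_MAX 4	/* same limit as EHCA_MOD_QP_PARM_MAX */

struct qp_attr { int qp_state; };
struct mod_qp_parm { int mask; struct qp_attr attr; };

struct sqp_model {
	int caching;	/* stands in for mod_qp_parm != NULL */
	int parm_idx;
	struct mod_qp_parm parm[MOD_QP_PARM_MAX];
};

/* stand-in for the real hardware call (internal_modify_qp) */
static int hw_modify(const struct qp_attr *attr, int mask)
{
	printf("hw modify: state=%d mask=%#x\n", attr->qp_state, mask);
	return 0;
}

/* modify_qp(): while the port is not active yet, only record the call */
static int modify_qp(struct sqp_model *qp, const struct qp_attr *attr, int mask)
{
	if (!qp->caching)
		return hw_modify(attr, mask);
	if (qp->parm_idx >= MOD_QP_PARM_MAX)
		return -1;	/* overflow, rejected as in the driver */
	qp->parm[qp->parm_idx].mask = mask;
	qp->parm[qp->parm_idx].attr = *attr;
	qp->parm_idx++;
	return 0;
}

/* recover_sqp(): replay everything once PORT_ACTIVE is seen */
static void recover_sqp(struct sqp_model *qp)
{
	int i;

	for (i = 0; i < qp->parm_idx; i++)
		hw_modify(&qp->parm[i].attr, qp->parm[i].mask);
	qp->caching = 0;	/* later calls go straight to hardware */
}

int main(void)
{
	struct sqp_model qp = { .caching = 1 };
	struct qp_attr a = { .qp_state = 1 };

	modify_qp(&qp, &a, 0x1);	/* INIT: cached */
	a.qp_state = 2;
	modify_qp(&qp, &a, 0x3);	/* RTR: cached */
	recover_sqp(&qp);		/* port came up: both replayed */
	return 0;
}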
+ 80 - 32
drivers/infiniband/hw/ehca/ehca_reqs.c

@@ -50,6 +50,9 @@
 #include "hcp_if.h"
 #include "hipz_fns.h"
 
+/* in RC traffic, insert an empty RDMA READ every this many packets */
+#define ACK_CIRC_THRESHOLD 2000000
+
 static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
 				  struct ehca_wqe *wqe_p,
 				  struct ib_recv_wr *recv_wr)
@@ -81,7 +84,7 @@ static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
 	if (ehca_debug_level) {
 		ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
 			     ipz_rqueue);
-		ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+		ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
 	}
 
 	return 0;
@@ -135,7 +138,8 @@ static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
 
 static inline int ehca_write_swqe(struct ehca_qp *qp,
 				  struct ehca_wqe *wqe_p,
-				  const struct ib_send_wr *send_wr)
+				  const struct ib_send_wr *send_wr,
+				  int hidden)
 {
 	u32 idx;
 	u64 dma_length;
@@ -176,7 +180,9 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 	wqe_p->wr_flag = 0;
 
-	if (send_wr->send_flags & IB_SEND_SIGNALED)
+	if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+	    qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+	    && !hidden)
 		wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
 
 	if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
@@ -199,7 +205,7 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 
 		wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
 		wqe_p->local_ee_context_qkey = remote_qkey;
-		if (!send_wr->wr.ud.ah) {
+		if (unlikely(!send_wr->wr.ud.ah)) {
 			ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
 			return -EINVAL;
 		}
@@ -255,6 +261,15 @@ static inline int ehca_write_swqe(struct ehca_qp *qp,
 		} /* eof idx */
 		wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
 
+		/* unsolicited ack circumvention */
+		if (send_wr->opcode == IB_WR_RDMA_READ) {
+			/* on RDMA read, switch on and reset counters */
+			qp->message_count = qp->packet_count = 0;
+			qp->unsol_ack_circ = 1;
+		} else
+			/* else estimate #packets */
+			qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
 		break;
 
 	default:
@@ -355,13 +370,49 @@ static inline void map_ib_wc_status(u32 cqe_status,
 		*wc_status = IB_WC_SUCCESS;
 }
 
+static inline int post_one_send(struct ehca_qp *my_qp,
+			 struct ib_send_wr *cur_send_wr,
+			 struct ib_send_wr **bad_send_wr,
+			 int hidden)
+{
+	struct ehca_wqe *wqe_p;
+	int ret;
+	u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+	/* get pointer next to free WQE */
+	wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+	if (unlikely(!wqe_p)) {
+		/* too many posted work requests: queue overflow */
+		if (bad_send_wr)
+			*bad_send_wr = cur_send_wr;
+		ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -ENOMEM;
+	}
+	/* write a SEND WQE into the QUEUE */
+	ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, hidden);
+	/*
+	 * if something failed,
+	 * reset the free entry pointer to the start value
+	 */
+	if (unlikely(ret)) {
+		my_qp->ipz_squeue.current_q_offset = start_offset;
+		if (bad_send_wr)
+			*bad_send_wr = cur_send_wr;
+		ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+			 "qp_num=%x", my_qp->ib_qp.qp_num);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int ehca_post_send(struct ib_qp *qp,
 		   struct ib_send_wr *send_wr,
 		   struct ib_send_wr **bad_send_wr)
 {
 	struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
 	struct ib_send_wr *cur_send_wr;
-	struct ehca_wqe *wqe_p;
 	int wqe_cnt = 0;
 	int ret = 0;
 	unsigned long flags;
@@ -369,37 +420,33 @@ int ehca_post_send(struct ib_qp *qp,
 	/* LOCK the QUEUE */
 	spin_lock_irqsave(&my_qp->spinlock_s, flags);
 
+	/* Send an empty extra RDMA read if:
+	 *  1) there has been an RDMA read on this connection before
+	 *  2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+	 *  3) we can be sure that any previous extra RDMA read has been
+	 *     processed so we don't overflow the SQ
+	 */
+	if (unlikely(my_qp->unsol_ack_circ &&
+		     my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+		     my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+		/* insert an empty RDMA READ to fix up the remote QP state */
+		struct ib_send_wr circ_wr;
+		memset(&circ_wr, 0, sizeof(circ_wr));
+		circ_wr.opcode = IB_WR_RDMA_READ;
+		post_one_send(my_qp, &circ_wr, NULL, 1); /* ignore retcode */
+		wqe_cnt++;
+		ehca_dbg(qp->device, "posted circ wr  qp_num=%x", qp->qp_num);
+		my_qp->message_count = my_qp->packet_count = 0;
+	}
+
 	/* loop processes list of send reqs */
 	for (cur_send_wr = send_wr; cur_send_wr != NULL;
 	     cur_send_wr = cur_send_wr->next) {
-		u64 start_offset = my_qp->ipz_squeue.current_q_offset;
-		/* get pointer next to free WQE */
-		wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
-		if (unlikely(!wqe_p)) {
-			/* too many posted work requests: queue overflow */
-			if (bad_send_wr)
-				*bad_send_wr = cur_send_wr;
-			if (wqe_cnt == 0) {
-				ret = -ENOMEM;
-				ehca_err(qp->device, "Too many posted WQEs "
-					 "qp_num=%x", qp->qp_num);
-			}
-			goto post_send_exit0;
-		}
-		/* write a SEND WQE into the QUEUE */
-		ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr);
-		/*
-		 * if something failed,
-		 * reset the free entry pointer to the start value
-		 */
+		ret = post_one_send(my_qp, cur_send_wr, bad_send_wr, 0);
 		if (unlikely(ret)) {
-			my_qp->ipz_squeue.current_q_offset = start_offset;
-			*bad_send_wr = cur_send_wr;
-			if (wqe_cnt == 0) {
-				ret = -EINVAL;
-				ehca_err(qp->device, "Could not write WQE "
-					 "qp_num=%x", qp->qp_num);
-			}
+			/* if one or more WQEs were successful, don't fail */
+			if (wqe_cnt)
+				ret = 0;
 			goto post_send_exit0;
 		}
 		wqe_cnt++;
@@ -410,6 +457,7 @@ int ehca_post_send(struct ib_qp *qp,
 post_send_exit0:
 	iosync(); /* serialize GAL register access */
 	hipz_update_sqa(my_qp, wqe_cnt);
+	my_qp->message_count += wqe_cnt;
 	spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
 	return ret;
 }

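The reworked ehca_post_send() above keeps two counters per RC QP: packet_count estimates link packets from each WQE's DMA length and the cached mtu_shift, and message_count tracks posted WQEs. Once an RDMA READ has been seen on the connection, an extra empty RDMA READ is injected whenever both counters exceed their thresholds, to fix up the remote QP state as the comments above describe. A user-space model of just that bookkeeping, with made-up traffic; mtu_shift corresponds to ld(path MTU) as stored by internal_modify_qp() earlier, and the driver actually splits this accounting between ehca_write_swqe() and ehca_post_send():

#include <stdio.h>
#include <stdint.h>

#define ACK_CIRC_THRESHOLD 2000000	/* same value as in the hunk above */

struct rc_qp_model {
	int unsol_ack_circ;		/* an RDMA READ has happened */
	uint32_t message_count;
	uint32_t packet_count;
	int mtu_shift;			/* ld(path MTU) */
	unsigned int max_send_wr;
};

/* per-WQE accounting */
static void account_send(struct rc_qp_model *qp, int is_rdma_read, uint64_t len)
{
	if (is_rdma_read) {
		qp->message_count = qp->packet_count = 0;
		qp->unsol_ack_circ = 1;
	} else {
		qp->packet_count += (len >> qp->mtu_shift) + 1;
	}
	qp->message_count++;
}

/* the check made at the top of ehca_post_send() */
static int need_circ_rdma_read(const struct rc_qp_model *qp)
{
	return qp->unsol_ack_circ &&
	       qp->packet_count > ACK_CIRC_THRESHOLD &&
	       qp->message_count > qp->max_send_wr;
}

int main(void)
{
	struct rc_qp_model qp = { .mtu_shift = 11, .max_send_wr = 16 };
	uint64_t i;

	account_send(&qp, 1, 0);		/* initial RDMA READ */
	for (i = 0; i < 3000000; i++)
		account_send(&qp, 0, 4096);	/* plain RC sends */

	printf("insert empty RDMA READ: %s\n",
	       need_circ_rdma_read(&qp) ? "yes" : "no");
	return 0;
}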
+ 3 - 3
drivers/infiniband/hw/ehca/ehca_sqp.c

@@ -40,11 +40,8 @@
  */
 
 
-#include <linux/module.h>
-#include <linux/err.h>
 #include "ehca_classes.h"
 #include "ehca_tools.h"
-#include "ehca_qes.h"
 #include "ehca_iverbs.h"
 #include "hcp_if.h"
 
@@ -93,6 +90,9 @@ u64 ehca_define_sqp(struct ehca_shca *shca,
 		return H_PARAMETER;
 	}
 
+	if (ehca_nr_ports < 0) /* autodetect mode */
+		return H_SUCCESS;
+
 	for (counter = 0;
 	     shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
 		     counter < ehca_port_act_time;

+ 31 - 4
drivers/infiniband/hw/ipath/ipath_common.h

@@ -81,6 +81,16 @@
 #define IPATH_IB_LINK_LOOPBACK	6 /* enable local loopback */
 #define IPATH_IB_LINK_EXTERNAL	7 /* normal, disable local loopback */
 
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to ipath_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs.  They
+ * are also the possible values for ipath_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
 /*
  * stats maintained by the driver.  For now, at least, this is global
  * to all minor devices.
@@ -433,8 +443,9 @@ struct ipath_user_info {
 #define IPATH_CMD_UNUSED_2	26
 #define IPATH_CMD_PIOAVAILUPD	27	/* force an update of PIOAvail reg */
 #define IPATH_CMD_POLL_TYPE	28	/* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL	29 /* armlaunch detection control */
 
-#define IPATH_CMD_MAX		28
+#define IPATH_CMD_MAX		29
 
 /*
  * Poll types
@@ -477,6 +488,8 @@ struct ipath_cmd {
 		__u64 port_info;
 		/* enable/disable receipt of packets */
 		__u32 recv_ctrl;
+		/* enable/disable armlaunch errors (non-zero to enable) */
+		__u32 armlaunch_ctrl;
 		/* partition key to set */
 		__u16 part_key;
 		/* user address of __u32 bitmask of active slaves */
@@ -579,7 +592,7 @@ struct ipath_flash {
 struct infinipath_counters {
 	__u64 LBIntCnt;
 	__u64 LBFlowStallCnt;
-	__u64 Reserved1;
+	__u64 TxSDmaDescCnt;	/* was Reserved1 */
 	__u64 TxUnsupVLErrCnt;
 	__u64 TxDataPktCnt;
 	__u64 TxFlowPktCnt;
@@ -615,12 +628,26 @@ struct infinipath_counters {
 	__u64 RxP6HdrEgrOvflCnt;
 	__u64 RxP7HdrEgrOvflCnt;
 	__u64 RxP8HdrEgrOvflCnt;
-	__u64 Reserved6;
-	__u64 Reserved7;
+	__u64 RxP9HdrEgrOvflCnt;	/* was Reserved6 */
+	__u64 RxP10HdrEgrOvflCnt;	/* was Reserved7 */
+	__u64 RxP11HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP12HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP13HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP14HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP15HdrEgrOvflCnt;	/* new for IBA7220 */
+	__u64 RxP16HdrEgrOvflCnt;	/* new for IBA7220 */
 	__u64 IBStatusChangeCnt;
 	__u64 IBLinkErrRecoveryCnt;
 	__u64 IBLinkDownedCnt;
 	__u64 IBSymbolErrCnt;
+	/* The following are new for IBA7220 */
+	__u64 RxVL15DroppedPktCnt;
+	__u64 RxOtherLocalPhyErrCnt;
+	__u64 PcieRetryBufDiagQwordCnt;
+	__u64 ExcessBufferOvflCnt;
+	__u64 LocalLinkIntegrityErrCnt;
+	__u64 RxVlErrCnt;
+	__u64 RxDlidFltrCnt;
 };
 
 /*

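The infinipath_counters changes above rename formerly reserved slots and append the new IBA7220 counters at the end, so every pre-existing field keeps its offset and existing readers of the counter block remain compatible; the iba6110 code later in this diff snapshots the old layout in a private _infinipath_do_not_use_counters struct for exactly that reason. A cut-down compile-and-run check of the idea, using only a few representative fields:

#include <stdio.h>
#include <stddef.h>

/* Model of the layout rule above: renaming a ReservedN slot keeps every
 * existing offset, and genuinely new counters only ever go on the end. */
struct counters_old {
	unsigned long long LBIntCnt;
	unsigned long long LBFlowStallCnt;
	unsigned long long Reserved1;
	unsigned long long TxUnsupVLErrCnt;
};

struct counters_new {
	unsigned long long LBIntCnt;
	unsigned long long LBFlowStallCnt;
	unsigned long long TxSDmaDescCnt;	/* was Reserved1 */
	unsigned long long TxUnsupVLErrCnt;
	unsigned long long RxVL15DroppedPktCnt;	/* appended for IBA7220 */
};

int main(void)
{
	printf("TxUnsupVLErrCnt offset: old %zu, new %zu\n",
	       offsetof(struct counters_old, TxUnsupVLErrCnt),
	       offsetof(struct counters_new, TxUnsupVLErrCnt));
	return 0;
}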
+ 1 - 1
drivers/infiniband/hw/ipath/ipath_cq.c

@@ -421,7 +421,7 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	else
 		n = head - tail;
 	if (unlikely((u32)cqe < n)) {
-		ret = -EOVERFLOW;
+		ret = -EINVAL;
 		goto bail_unlock;
 	}
 	for (n = 0; tail != head; n++) {

+ 2 - 2
drivers/infiniband/hw/ipath/ipath_debug.h

@@ -55,7 +55,7 @@
 #define __IPATH_PKTDBG      0x80	/* print packet data */
 /* print process startup (init)/exit messages */
 #define __IPATH_PROCDBG     0x100
-/* print mmap/nopage stuff, not using VDBG any more */
+/* print mmap/fault stuff, not using VDBG any more */
 #define __IPATH_MMDBG       0x200
 #define __IPATH_ERRPKTDBG   0x400
 #define __IPATH_USER_SEND   0x1000	/* use user mode send */
@@ -81,7 +81,7 @@
 #define __IPATH_VERBDBG   0x0	/* very verbose debug */
 #define __IPATH_PKTDBG    0x0	/* print packet data */
 #define __IPATH_PROCDBG   0x0	/* process startup (init)/exit messages */
-/* print mmap/nopage stuff, not using VDBG any more */
+/* print mmap/fault stuff, not using VDBG any more */
 #define __IPATH_MMDBG     0x0
 #define __IPATH_EPKTDBG   0x0	/* print ethernet packet data */
 #define __IPATH_IPATHDBG  0x0	/* Ethernet (IPATH) table dump on */

+ 74 - 106
drivers/infiniband/hw/ipath/ipath_driver.c

@@ -334,6 +334,8 @@ static void ipath_verify_pioperf(struct ipath_devdata *dd)
 		udelay(1);
 	}
 
+	ipath_disable_armlaunch(dd);
+
 	writeq(0, piobuf); /* length 0, no dwords actually sent */
 	ipath_flush_wc();
 
@@ -365,6 +367,7 @@ static void ipath_verify_pioperf(struct ipath_devdata *dd)
 done:
 	/* disarm piobuf, so it's available again */
 	ipath_disarm_piobufs(dd, pbnum, 1);
+	ipath_enable_armlaunch(dd);
 }
 
 static int __devinit ipath_init_one(struct pci_dev *pdev,
@@ -803,31 +806,37 @@ void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
 			  unsigned cnt)
 {
 	unsigned i, last = first + cnt;
-	u64 sendctrl, sendorig;
+	unsigned long flags;
 
 	ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
-	sendorig = dd->ipath_sendctrl;
 	for (i = first; i < last; i++) {
-		sendctrl = sendorig  | INFINIPATH_S_DISARM |
-			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT);
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		/*
+		 * The disarm-related bits are write-only, so it
+		 * is ok to OR them in with our copy of sendctrl
+		 * while we hold the lock.
+		 */
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-				 sendctrl);
+			dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+			(i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+		/* can't disarm bufs back-to-back per iba7220 spec */
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 	}
 
 	/*
-	 * Write it again with current value, in case ipath_sendctrl changed
-	 * while we were looping; no critical bits that would require
-	 * locking.
-	 *
-	 * disable PIOAVAILUPD, then re-enable, reading scratch in
+	 * Disable PIOAVAILUPD, then re-enable, reading scratch in
 	 * between.  This seems to avoid a chip timing race that causes
-	 * pioavail updates to memory to stop.
+	 * pioavail updates to memory to stop.  We xor as we don't
+	 * know the state of the bit when we're called.
 	 */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			 sendorig & ~INFINIPATH_S_PIOBUFAVAILUPD);
-	sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		dd->ipath_sendctrl ^ INFINIPATH_S_PIOBUFAVAILUPD);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 			 dd->ipath_sendctrl);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 }
 
 /**
@@ -1003,12 +1012,10 @@ static void get_rhf_errstring(u32 err, char *msg, size_t len)
  * ipath_get_egrbuf - get an eager buffer
  * @dd: the infinipath device
  * @bufnum: the eager buffer to get
- * @err: unused
  *
  * must only be called if ipath_pd[port] is known to be allocated
  */
-static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum,
-				     int err)
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
 {
 	return dd->ipath_port0_skbinfo ?
 		(void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
@@ -1100,13 +1107,14 @@ static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
 
 /*
  * ipath_kreceive - receive a packet
- * @dd: the infinipath device
+ * @pd: the infinipath port
  *
  * called from interrupt handler for errors or receive interrupt
  */
-void ipath_kreceive(struct ipath_devdata *dd)
+void ipath_kreceive(struct ipath_portdata *pd)
 {
 	u64 *rc;
+	struct ipath_devdata *dd = pd->port_dd;
 	void *ebuf;
 	const u32 rsize = dd->ipath_rcvhdrentsize;	/* words */
 	const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize;	/* words */
@@ -1121,8 +1129,8 @@ void ipath_kreceive(struct ipath_devdata *dd)
 		goto bail;
 	}
 
-	l = dd->ipath_port0head;
-	hdrqtail = (u32) le64_to_cpu(*dd->ipath_hdrqtailptr);
+	l = pd->port_head;
+	hdrqtail = ipath_get_rcvhdrtail(pd);
 	if (l == hdrqtail)
 		goto bail;
 
@@ -1131,7 +1139,7 @@ reloop:
 		u32 qp;
 		u8 *bthbytes;
 
-		rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2));
+		rc = (u64 *) (pd->port_rcvhdrq + (l << 2));
 		hdr = (struct ipath_message_header *)&rc[1];
 		/*
 		 * could make a network order version of IPATH_KD_QP, and
@@ -1156,7 +1164,7 @@ reloop:
 			etail = ipath_hdrget_index((__le32 *) rc);
 			if (tlen > sizeof(*hdr) ||
 			    etype == RCVHQ_RCV_TYPE_NON_KD)
-				ebuf = ipath_get_egrbuf(dd, etail, 0);
+				ebuf = ipath_get_egrbuf(dd, etail);
 		}
 
 		/*
@@ -1191,7 +1199,7 @@ reloop:
 				  be32_to_cpu(hdr->bth[0]) & 0xff);
 		else {
 			/*
-			 * error packet, type of error	unknown.
+			 * error packet, type of error unknown.
 			 * Probably type 3, but we don't know, so don't
 			 * even try to print the opcode, etc.
 			 */
@@ -1241,7 +1249,7 @@ reloop:
 		 * earlier packets, we "almost" guarantee we have covered
 		 * that case.
 		 */
-		u32 hqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr);
+		u32 hqtail = ipath_get_rcvhdrtail(pd);
 		if (hqtail != hdrqtail) {
 			hdrqtail = hqtail;
 			reloop = 1; /* loop 1 extra time at most */
@@ -1251,7 +1259,7 @@ reloop:
 
 	pkttot += i;
 
-	dd->ipath_port0head = l;
+	pd->port_head = l;
 
 	if (pkttot > ipath_stats.sps_maxpkts_call)
 		ipath_stats.sps_maxpkts_call = pkttot;
@@ -1335,14 +1343,9 @@ static void ipath_update_pio_bufs(struct ipath_devdata *dd)
 		/*
 		 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
 		 */
-		if (i > 3) {
-			if (i & 1)
-				piov = le64_to_cpu(
-					dd->ipath_pioavailregs_dma[i - 1]);
-			else
-				piov = le64_to_cpu(
-					dd->ipath_pioavailregs_dma[i + 1]);
-		} else
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+		else
 			piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
 		pchg = _IPATH_ALL_CHECKBITS &
 			~(dd->ipath_pioavailshadow[i] ^ piov);
@@ -1601,7 +1604,8 @@ int ipath_create_rcvhdrq(struct ipath_devdata *dd,
 
 	/* clear for security and sanity on each use */
 	memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
-	memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+	if (pd->port_rcvhdrtail_kvaddr)
+		memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
 
 	/*
 	 * tell chip each time we init it, even if we are re-using previous
@@ -1617,77 +1621,6 @@ bail:
 	return ret;
 }
 
-int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id,
-			   u64 bits_to_wait_for, u64 * valp)
-{
-	unsigned long timeout;
-	u64 lastval, val;
-	int ret;
-
-	lastval = ipath_read_kreg64(dd, reg_id);
-	/* wait a ridiculously long time */
-	timeout = jiffies + msecs_to_jiffies(5);
-	do {
-		val = ipath_read_kreg64(dd, reg_id);
-		/* set so they have something, even on failures. */
-		*valp = val;
-		if ((val & bits_to_wait_for) == bits_to_wait_for) {
-			ret = 0;
-			break;
-		}
-		if (val != lastval)
-			ipath_cdbg(VERBOSE, "Changed from %llx to %llx, "
-				   "waiting for %llx bits\n",
-				   (unsigned long long) lastval,
-				   (unsigned long long) val,
-				   (unsigned long long) bits_to_wait_for);
-		cond_resched();
-		if (time_after(jiffies, timeout)) {
-			ipath_dbg("Didn't get bits %llx in register 0x%x, "
-				  "got %llx\n",
-				  (unsigned long long) bits_to_wait_for,
-				  reg_id, (unsigned long long) *valp);
-			ret = -ENODEV;
-			break;
-		}
-	} while (1);
-
-	return ret;
-}
-
-/**
- * ipath_waitfor_mdio_cmdready - wait for last command to complete
- * @dd: the infinipath device
- *
- * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go
- * away indicating the last command has completed.  It doesn't return data
- */
-int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd)
-{
-	unsigned long timeout;
-	u64 val;
-	int ret;
-
-	/* wait a ridiculously long time */
-	timeout = jiffies + msecs_to_jiffies(5);
-	do {
-		val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio);
-		if (!(val & IPATH_MDIO_CMDVALID)) {
-			ret = 0;
-			break;
-		}
-		cond_resched();
-		if (time_after(jiffies, timeout)) {
-			ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n",
-				  (unsigned long long) val);
-			ret = -ENODEV;
-			break;
-		}
-	} while (1);
-
-	return ret;
-}
-
 
 /*
  * Flush all sends that might be in the ready to send state, as well as any
@@ -2056,6 +1989,8 @@ void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
  */
 void ipath_shutdown_device(struct ipath_devdata *dd)
 {
+	unsigned long flags;
+
 	ipath_dbg("Shutting down the device\n");
 
 	dd->ipath_flags |= IPATH_LINKUNK;
@@ -2076,9 +2011,13 @@ void ipath_shutdown_device(struct ipath_devdata *dd)
 	 * gracefully stop all sends allowing any in progress to trickle out
 	 * first.
 	 */
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL);
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = 0;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
 	/* flush it */
 	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
 	/*
 	 * enough for anything that's going to trickle out to have actually
 	 * done so.
@@ -2335,5 +2274,34 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
 	}
 	return 0;
 }
+
+/*
+ * Disable and enable the armlaunch error.  Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based.  Safe for the
+ * driver check, since it's at init.   Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code.  There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+	dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+		INFINIPATH_E_SPIOARMLAUNCH);
+	dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+	/* so don't re-enable if already set */
+	dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+		dd->ipath_errormask);
+}
+
 module_init(infinipath_init);
 module_exit(infinipath_cleanup);

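Two small patterns in the ipath_driver.c hunk above are worth calling out: sendctrl is now only touched under ipath_sendctrl_lock with a scratch-register read flushing each write, and the PIOBUFAVAILUPD workaround writes the shadow with that bit flipped (^) before restoring it, so the bit is guaranteed to change state even though the caller does not know its current value. A tiny user-space sketch of why XOR is used instead of AND-NOT; the bit position here is made up:

#include <stdio.h>
#include <stdint.h>

#define S_PIOBUFAVAILUPD (1ULL << 4)	/* hypothetical bit position */

static void pulse(uint64_t shadow)
{
	uint64_t first = shadow ^ S_PIOBUFAVAILUPD;	/* as in the hunk */
	uint64_t also  = shadow & ~S_PIOBUFAVAILUPD;	/* the old approach */

	printf("shadow=%#llx  xor-write=%#llx (always differs)  "
	       "andnot-write=%#llx\n",
	       (unsigned long long)shadow,
	       (unsigned long long)first,
	       (unsigned long long)also);
}

int main(void)
{
	pulse(S_PIOBUFAVAILUPD | 0x1);	/* bit currently set */
	pulse(0x1);			/* bit currently clear: &~ is a no-op */
	return 0;
}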
+ 11 - 12
drivers/infiniband/hw/ipath/ipath_eeprom.c

@@ -510,10 +510,10 @@ int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
 {
 	int ret;
 
-	ret = down_interruptible(&dd->ipath_eep_sem);
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
 	if (!ret) {
 		ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
-		up(&dd->ipath_eep_sem);
+		mutex_unlock(&dd->ipath_eep_lock);
 	}
 
 	return ret;
@@ -524,10 +524,10 @@ int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
 {
 	int ret;
 
-	ret = down_interruptible(&dd->ipath_eep_sem);
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
 	if (!ret) {
 		ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
-		up(&dd->ipath_eep_sem);
+		mutex_unlock(&dd->ipath_eep_lock);
 	}
 
 	return ret;
@@ -574,7 +574,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
 	struct ipath_devdata *dd0 = ipath_lookup(0);
 
 	if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
-		u8 *bguid, oguid;
+		u8 oguid;
 		dd->ipath_guid = dd0->ipath_guid;
 		bguid = (u8 *) & dd->ipath_guid;
 
@@ -616,9 +616,9 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
 		goto bail;
 	}
 
-	down(&dd->ipath_eep_sem);
+	mutex_lock(&dd->ipath_eep_lock);
 	eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
-	up(&dd->ipath_eep_sem);
+	mutex_unlock(&dd->ipath_eep_lock);
 
 	if (eep_stat) {
 		ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
@@ -674,7 +674,6 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
 		 * elsewhere for backward-compatibility.
 		 */
 		char *snp = dd->ipath_serial;
-		int len;
 		memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
 		snp[sizeof ifp->if_sprefix] = '\0';
 		len = strlen(snp);
@@ -764,14 +763,14 @@ int ipath_update_eeprom_log(struct ipath_devdata *dd)
 	/* Grab semaphore and read current EEPROM. If we get an
 	 * error, let go, but if not, keep it until we finish write.
 	 */
-	ret = down_interruptible(&dd->ipath_eep_sem);
+	ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
 	if (ret) {
 		ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
 		goto free_bail;
 	}
 	ret = ipath_eeprom_internal_read(dd, 0, buf, len);
 	if (ret) {
-		up(&dd->ipath_eep_sem);
+		mutex_unlock(&dd->ipath_eep_lock);
 		ipath_dev_err(dd, "Unable read EEPROM for logging\n");
 		goto free_bail;
 	}
@@ -779,7 +778,7 @@ int ipath_update_eeprom_log(struct ipath_devdata *dd)
 
 	csum = flash_csum(ifp, 0);
 	if (csum != ifp->if_csum) {
-		up(&dd->ipath_eep_sem);
+		mutex_unlock(&dd->ipath_eep_lock);
 		ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
 				csum, ifp->if_csum);
 		ret = 1;
@@ -849,7 +848,7 @@ int ipath_update_eeprom_log(struct ipath_devdata *dd)
 		csum = flash_csum(ifp, 1);
 		ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
 	}
-	up(&dd->ipath_eep_sem);
+	mutex_unlock(&dd->ipath_eep_lock);
 	if (ret)
 		ipath_dev_err(dd, "Failed updating EEPROM\n");
 

+ 51 - 43
drivers/infiniband/hw/ipath/ipath_file_ops.c

@@ -169,7 +169,7 @@ static int ipath_get_base_info(struct file *fp,
 		kinfo->spi_piocnt = dd->ipath_pbufsport;
 		kinfo->spi_piobufbase = (u64) pd->port_piobufs;
 		kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
-			dd->ipath_palign * pd->port_port;
+			dd->ipath_ureg_align * pd->port_port;
 	} else if (master) {
 		kinfo->spi_piocnt = (dd->ipath_pbufsport / subport_cnt) +
 				    (dd->ipath_pbufsport % subport_cnt);
@@ -186,7 +186,7 @@ static int ipath_get_base_info(struct file *fp,
 	}
 	if (shared) {
 		kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
-			dd->ipath_palign * pd->port_port;
+			dd->ipath_ureg_align * pd->port_port;
 		kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
 		kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
 		kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
@@ -742,11 +742,12 @@ static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
 		 * updated and correct itself, even in the face of software
 		 * bugs.
 		 */
-		*(volatile u64 *)pd->port_rcvhdrtail_kvaddr = 0;
-		set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+		if (pd->port_rcvhdrtail_kvaddr)
+			ipath_clear_rcvhdrtail(pd);
+		set_bit(dd->ipath_r_portenable_shift + pd->port_port,
 			&dd->ipath_rcvctrl);
 	} else
-		clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+		clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
 			  &dd->ipath_rcvctrl);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 			 dd->ipath_rcvctrl);
@@ -881,7 +882,7 @@ static int ipath_create_user_egr(struct ipath_portdata *pd)
 
 	egrcnt = dd->ipath_rcvegrcnt;
 	/* TID number offset for this port */
-	egroff = pd->port_port * egrcnt;
+	egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
 	egrsize = dd->ipath_rcvegrbufsize;
 	ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
 		   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
@@ -1049,11 +1050,6 @@ static int mmap_piobufs(struct vm_area_struct *vma,
 
 	phys = dd->ipath_physaddr + piobufs;
 
-	/*
-	 * Don't mark this as non-cached, or we don't get the
-	 * write combining behavior we want on the PIO buffers!
-	 */
-
 #if defined(__powerpc__)
 	/* There isn't a generic way to specify writethrough mappings */
 	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
@@ -1120,33 +1116,24 @@ bail:
 }
 
 /*
- * ipath_file_vma_nopage - handle a VMA page fault.
+ * ipath_file_vma_fault - handle a VMA page fault.
  */
-static struct page *ipath_file_vma_nopage(struct vm_area_struct *vma,
-					  unsigned long address, int *type)
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+					struct vm_fault *vmf)
 {
-	unsigned long offset = address - vma->vm_start;
-	struct page *page = NOPAGE_SIGBUS;
-	void *pageptr;
+	struct page *page;
 
-	/*
-	 * Convert the vmalloc address into a struct page.
-	 */
-	pageptr = (void *)(offset + (vma->vm_pgoff << PAGE_SHIFT));
-	page = vmalloc_to_page(pageptr);
+	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
 	if (!page)
-		goto out;
-
-	/* Increment the reference count. */
+		return VM_FAULT_SIGBUS;
 	get_page(page);
-	if (type)
-		*type = VM_FAULT_MINOR;
-out:
-	return page;
+	vmf->page = page;
+
+	return 0;
 }
 
 static struct vm_operations_struct ipath_file_vm_ops = {
-	.nopage = ipath_file_vma_nopage,
+	.fault = ipath_file_vma_fault,
 };
 
 static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
@@ -1284,7 +1271,7 @@ static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
 		goto bail;
 	}
 
-	ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+	ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
 	if (!pd->port_subport_cnt) {
 		/* port is not shared */
 		piocnt = dd->ipath_pbufsport;
@@ -1400,7 +1387,10 @@ static unsigned int ipath_poll_next(struct ipath_portdata *pd,
 	pollflag = ipath_poll_hdrqfull(pd);
 
 	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
-	tail = *(volatile u64 *)pd->port_rcvhdrtail_kvaddr;
+	if (pd->port_rcvhdrtail_kvaddr)
+		tail = ipath_get_rcvhdrtail(pd);
+	else
+		tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
 
 	if (head != tail)
 		pollflag |= POLLIN | POLLRDNORM;
@@ -1410,7 +1400,7 @@ static unsigned int ipath_poll_next(struct ipath_portdata *pd,
 		/* flush waiting flag so we don't miss an event */
 		wmb();
 
-		set_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT,
+		set_bit(pd->port_port + dd->ipath_r_intravail_shift,
 			&dd->ipath_rcvctrl);
 
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
@@ -1790,6 +1780,7 @@ static int find_shared_port(struct file *fp,
 			}
 			port_fp(fp) = pd;
 			subport_fp(fp) = pd->port_cnt++;
+			pd->port_subpid[subport_fp(fp)] = current->pid;
 			tidcursor_fp(fp) = 0;
 			pd->active_slaves |= 1 << subport_fp(fp);
 			ipath_cdbg(PROC,
@@ -1920,8 +1911,7 @@ static int ipath_do_user_init(struct file *fp,
 	 */
 	head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
 	ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
-	dd->ipath_lastegrheads[pd->port_port] = -1;
-	dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
+	pd->port_lastrcvhdrqtail = -1;
 	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
 		pd->port_port, head32);
 	pd->port_tidcursor = 0;	/* start at beginning after open */
@@ -1941,11 +1931,13 @@ static int ipath_do_user_init(struct file *fp,
 	 * We explictly set the in-memory copy to 0 beforehand, so we don't
 	 * have to wait to be sure the DMA update has happened.
 	 */
-	*(volatile u64 *)pd->port_rcvhdrtail_kvaddr = 0ULL;
-	set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+	if (pd->port_rcvhdrtail_kvaddr)
+		ipath_clear_rcvhdrtail(pd);
+	set_bit(dd->ipath_r_portenable_shift + pd->port_port,
 		&dd->ipath_rcvctrl);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
-			 dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD);
+			dd->ipath_rcvctrl &
+			~(1ULL << dd->ipath_r_tailupd_shift));
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 			 dd->ipath_rcvctrl);
 	/* Notify any waiting slaves */
@@ -2022,6 +2014,7 @@ static int ipath_close(struct inode *in, struct file *fp)
 		 * the slave(s) don't wait for receive data forever.
 		 */
 		pd->active_slaves &= ~(1 << fd->subport);
+		pd->port_subpid[fd->subport] = 0;
 		mutex_unlock(&ipath_mutex);
 		goto bail;
 	}
@@ -2054,9 +2047,9 @@ static int ipath_close(struct inode *in, struct file *fp)
 	if (dd->ipath_kregbase) {
 		int i;
 		/* atomically clear receive enable port and intr avail. */
-		clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port,
+		clear_bit(dd->ipath_r_portenable_shift + port,
 			  &dd->ipath_rcvctrl);
-		clear_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT,
+		clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
 			  &dd->ipath_rcvctrl);
 		ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
 			dd->ipath_rcvctrl);
@@ -2149,11 +2142,15 @@ static int ipath_get_slave_info(struct ipath_portdata *pd,
 
 static int ipath_force_pio_avail_update(struct ipath_devdata *dd)
 {
-	u64 reg = dd->ipath_sendctrl;
+	unsigned long flags;
 
-	clear_bit(IPATH_S_PIOBUFAVAILUPD, &reg);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, reg);
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+		dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
 	return 0;
 }
@@ -2227,6 +2224,11 @@ static ssize_t ipath_write(struct file *fp, const char __user *data,
 		dest = &cmd.cmd.poll_type;
 		src = &ucmd->cmd.poll_type;
 		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		copy = sizeof(cmd.cmd.armlaunch_ctrl);
+		dest = &cmd.cmd.armlaunch_ctrl;
+		src = &ucmd->cmd.armlaunch_ctrl;
+		break;
 	default:
 		ret = -EINVAL;
 		goto bail;
@@ -2302,6 +2304,12 @@ static ssize_t ipath_write(struct file *fp, const char __user *data,
 	case IPATH_CMD_POLL_TYPE:
 		pd->poll_type = cmd.cmd.poll_type;
 		break;
+	case IPATH_CMD_ARMLAUNCH_CTRL:
+		if (cmd.cmd.armlaunch_ctrl)
+			ipath_enable_armlaunch(pd->port_dd);
+		else
+			ipath_disable_armlaunch(pd->port_dd);
+		break;
 	}
 
 	if (ret >= 0)

+ 4 - 10
drivers/infiniband/hw/ipath/ipath_fs.c

@@ -108,21 +108,16 @@ static const struct file_operations atomic_stats_ops = {
 	.read = atomic_stats_read,
 };
 
-#define NUM_COUNTERS sizeof(struct infinipath_counters) / sizeof(u64)
-
 static ssize_t atomic_counters_read(struct file *file, char __user *buf,
 				    size_t count, loff_t *ppos)
 {
-	u64 counters[NUM_COUNTERS];
-	u16 i;
+	struct infinipath_counters counters;
 	struct ipath_devdata *dd;
 
 	dd = file->f_path.dentry->d_inode->i_private;
+	dd->ipath_f_read_counters(dd, &counters);
 
-	for (i = 0; i < NUM_COUNTERS; i++)
-		counters[i] = ipath_snap_cntr(dd, i);
-
-	return simple_read_from_buffer(buf, count, ppos, counters,
+	return simple_read_from_buffer(buf, count, ppos, &counters,
 				       sizeof counters);
 }
 
@@ -243,8 +238,7 @@ static int create_device_files(struct super_block *sb,
 
 	snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
 	ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
-			  (struct file_operations *) &simple_dir_operations,
-			  dd);
+			  &simple_dir_operations, dd);
 	if (ret) {
 		printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
 		goto bail;

+ 346 - 49
drivers/infiniband/hw/ipath/ipath_iba6110.c

@@ -148,10 +148,57 @@ struct _infinipath_do_not_use_kernel_regs {
 	unsigned long long ReservedSW2[4];
 };
 
-#define IPATH_KREG_OFFSET(field) (offsetof(struct \
-    _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+struct _infinipath_do_not_use_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 Reserved1;
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 Reserved6;
+	__u64 Reserved7;
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
 #define IPATH_CREG_OFFSET(field) (offsetof( \
-    struct infinipath_counters, field) / sizeof(u64))
+	struct _infinipath_do_not_use_counters, field) / sizeof(u64))
 
 static const struct ipath_kregs ipath_ht_kregs = {
 	.kr_control = IPATH_KREG_OFFSET(Control),
@@ -282,6 +329,9 @@ static const struct ipath_cregs ipath_ht_cregs = {
 #define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
 #define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
 
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
 /* kr_extstatus bits */
 #define INFINIPATH_EXTS_FREQSEL 0x2
 #define INFINIPATH_EXTS_SERDESSEL 0x4
@@ -296,6 +346,12 @@ static const struct ipath_cregs ipath_ht_cregs = {
 #define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
 #define INFINIPATH_RT_BUFSIZE_SHIFT 48
 
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET          0x7ULL
+
 /*
  * masks and bits that are different in different chips, or present only
  * in one
@@ -652,7 +708,6 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
 			      "with ID %u\n", boardrev);
 		snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
 			 boardrev);
-		ret = 1;
 		break;
 	}
 	if (n)
@@ -686,6 +741,13 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
 			      dd->ipath_htspeed);
 	ret = 0;
 
+	/*
+	 * set here, not in ipath_init_*_funcs because we have to do
+	 * it after we can read chip registers.
+	 */
+	dd->ipath_ureg_align =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
 bail:
 	return ret;
 }
@@ -969,7 +1031,8 @@ static int ipath_setup_ht_config(struct ipath_devdata *dd,
 	do {
 		u8 cap_type;
 
-		/* the HT capability type byte is 3 bytes after the
+		/*
+		 * The HT capability type byte is 3 bytes after the
 		 * capability byte.
 		 */
 		if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
@@ -982,6 +1045,8 @@ static int ipath_setup_ht_config(struct ipath_devdata *dd,
 	} while ((pos = pci_find_next_capability(pdev, pos,
 						 PCI_CAP_ID_HT)));
 
+	dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
 bail:
 	return ret;
 }
@@ -1074,11 +1139,55 @@ static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
 
 static void ipath_init_ht_variables(struct ipath_devdata *dd)
 {
+	/*
+	 * setup the register offsets, since they are different for each
+	 * chip
+	 */
+	dd->ipath_kregs = &ipath_ht_kregs;
+	dd->ipath_cregs = &ipath_ht_cregs;
+
 	dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
 	dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
 	dd->ipath_gpio_sda = IPATH_GPIO_SDA;
 	dd->ipath_gpio_scl = IPATH_GPIO_SCL;
 
+	/*
+	 * Fill in data for field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKTRAININGSTATE
+	 * and only the shift for LINKSTATE, as they are the only ones
+	 * that change.  Also precalculate the 3 link states of interest
+	 * and the combined mask.
+	 */
+	dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+	dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+	dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+		dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+	dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+	dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+	dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+	/*
+	 * Fill in data for ibcc field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKINITCMD
+	 * and only the shift for LINKCMD and MAXPKTLEN, as they are
+	 * the only ones that change.
+	 */
+	dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+	dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+	dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+	/* Fill in shifts for RcvCtrl. */
+	dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+	dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+	dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+	dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
 	dd->ipath_i_bitsextant =
 		(INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
 		(INFINIPATH_I_RCVAVAIL_MASK <<
@@ -1135,6 +1244,8 @@ static void ipath_init_ht_variables(struct ipath_devdata *dd)
 
 	dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
 	dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+	dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+	dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
 
 	/*
 	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
@@ -1148,9 +1259,17 @@ static void ipath_init_ht_variables(struct ipath_devdata *dd)
 		INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
 		INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
 
-	dd->ipath_eep_st_masks[2].errs_to_log =
-		INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET;
+	dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
 
+	dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+	dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	dd->ipath_link_speed_supported = IPATH_IB_SDR;
+	dd->ipath_link_width_enabled = IB_WIDTH_4X;
+	dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+	/* these can't change for this chip, so set once */
+	dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+	dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
 }
 
 /**
@@ -1205,14 +1324,16 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
 	val &= ~INFINIPATH_HWE_HTCMISCERR4;
 
 	/*
-	 * PLL ignored because MDIO interface has a logic problem
-	 * for reads, on Comstock and Ponderosa.  BRINGUP
+	 * PLL ignored because unused MDIO interface has a logic problem
 	 */
 	if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
 		val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
 	dd->ipath_hwerrmask = val;
 }
 
+
+
+
 /**
  * ipath_ht_bringup_serdes - bring up the serdes
  * @dd: the infinipath device
@@ -1284,16 +1405,6 @@ static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
 	}
 
 	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
-	if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) &
-	     INFINIPATH_XGXS_MDIOADDR_MASK) != 3) {
-		val &= ~(INFINIPATH_XGXS_MDIOADDR_MASK <<
-			 INFINIPATH_XGXS_MDIOADDR_SHIFT);
-		/*
-		 * we use address 3
-		 */
-		val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT;
-		change = 1;
-	}
 	if (val & INFINIPATH_XGXS_RESET) {
 		/* normally true after boot */
 		val &= ~INFINIPATH_XGXS_RESET;
@@ -1329,21 +1440,6 @@ static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
 		   (unsigned long long)
 		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
 
-	if (!ipath_waitfor_mdio_cmdready(dd)) {
-		ipath_write_kreg(dd, dd->ipath_kregs->kr_mdio,
-				 ipath_mdio_req(IPATH_MDIO_CMD_READ, 31,
-						IPATH_MDIO_CTRL_XGXS_REG_8,
-						0));
-		if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio,
-					   IPATH_MDIO_DATAVALID, &val))
-			ipath_dbg("Never got MDIO data for XGXS status "
-				  "read\n");
-		else
-			ipath_cdbg(VERBOSE, "MDIO Read reg8, "
-				   "'bank' 31 %x\n", (u32) val);
-	} else
-		ipath_dbg("Never got MDIO cmdready for XGXS status read\n");
-
 	return ret;		/* for now, say we always succeeded */
 }
 
@@ -1396,6 +1492,7 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd,
 			pa |= lenvalid | INFINIPATH_RT_VALID;
 		}
 	}
+
 	writeq(pa, tidptr);
 }
 
@@ -1526,8 +1623,7 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
 	}
 
 	ipath_get_eeprom_info(dd);
-	if (dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' &&
-		dd->ipath_serial[1] == '2' && dd->ipath_serial[2] == '8') {
+	if (dd->ipath_boardrev == 5) {
 		/*
 		 * Later production QHT7040 has same changes as QHT7140, so
 		 * can use GPIO interrupts.  They have serial #'s starting
@@ -1602,6 +1698,210 @@ static void ipath_ht_free_irq(struct ipath_devdata *dd)
 	dd->ipath_intconfig = 0;
 }
 
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+	return (struct ipath_message_header *)
+		&rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+	dd->ipath_portcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+	dd->ipath_p0_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+				   struct infinipath_counters *cntrs)
+{
+	cntrs->LBIntCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+	cntrs->LBFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+	cntrs->TxSDmaDescCnt = 0;
+	cntrs->TxUnsupVLErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+	cntrs->TxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+	cntrs->TxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+	cntrs->TxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+	cntrs->TxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+	cntrs->TxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+	cntrs->TxUnderrunCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+	cntrs->TxFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+	cntrs->TxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+	cntrs->RxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+	cntrs->RxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+	cntrs->RxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+	cntrs->RxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+	cntrs->RxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+	cntrs->RxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+	cntrs->RxICRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+	cntrs->RxVCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+	cntrs->RxFlowCtrlErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+	cntrs->RxBadFormatCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+	cntrs->RxLinkProblemCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+	cntrs->RxEBPCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+	cntrs->RxLPCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+	cntrs->RxBufOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+	cntrs->RxTIDFullErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+	cntrs->RxTIDValidErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+	cntrs->RxPKeyMismatchCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+	cntrs->RxP0HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+	cntrs->RxP1HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+	cntrs->RxP2HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+	cntrs->RxP3HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+	cntrs->RxP4HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+	cntrs->RxP5HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+	cntrs->RxP6HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+	cntrs->RxP7HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+	cntrs->RxP8HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+	cntrs->RxP9HdrEgrOvflCnt = 0;
+	cntrs->RxP10HdrEgrOvflCnt = 0;
+	cntrs->RxP11HdrEgrOvflCnt = 0;
+	cntrs->RxP12HdrEgrOvflCnt = 0;
+	cntrs->RxP13HdrEgrOvflCnt = 0;
+	cntrs->RxP14HdrEgrOvflCnt = 0;
+	cntrs->RxP15HdrEgrOvflCnt = 0;
+	cntrs->RxP16HdrEgrOvflCnt = 0;
+	cntrs->IBStatusChangeCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+	cntrs->IBLinkErrRecoveryCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+	cntrs->IBLinkDownedCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+	cntrs->IBSymbolErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+	cntrs->RxVL15DroppedPktCnt = 0;
+	cntrs->RxOtherLocalPhyErrCnt = 0;
+	cntrs->PcieRetryBufDiagQwordCnt = 0;
+	cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+	cntrs->LocalLinkIntegrityErrCnt =
+		(dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+		dd->ipath_lli_errs : dd->ipath_lli_errors;
+	cntrs->RxVlErrCnt = 0;
+	cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+	return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+	u64 val, prev_val;
+
+	prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	val = prev_val | INFINIPATH_XGXS_RESET;
+	prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+}
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+	int ret;
+
+	switch (which) {
+	case IPATH_IB_CFG_LWID:
+		ret = dd->ipath_link_width_active;
+		break;
+	case IPATH_IB_CFG_SPD:
+		ret = dd->ipath_link_speed_active;
+		break;
+	case IPATH_IB_CFG_LWID_ENB:
+		ret = dd->ipath_link_width_enabled;
+		break;
+	case IPATH_IB_CFG_SPD_ENB:
+		ret = dd->ipath_link_speed_enabled;
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+	int ret = 0;
+
+	if (which == IPATH_IB_CFG_LWID_ENB)
+		dd->ipath_link_width_enabled = val;
+	else if (which == IPATH_IB_CFG_SPD_ENB)
+		dd->ipath_link_speed_enabled = val;
+	else
+		ret = -ENOTSUPP;
+	return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+	ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+		ipath_ib_linktrstate(dd, ibcs));
+	return 0;
+}
+
+
 /**
  * ipath_init_iba6110_funcs - set up the chip-specific function pointers
  * @dd: the infinipath device
@@ -1626,22 +1926,19 @@ void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_setextled = ipath_setup_ht_setextled;
 	dd->ipath_f_get_base_info = ipath_ht_get_base_info;
 	dd->ipath_f_free_irq = ipath_ht_free_irq;
-
-	/*
-	 * initialize chip-specific variables
-	 */
 	dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+	dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+	dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+	dd->ipath_f_config_ports = ipath_ht_config_ports;
+	dd->ipath_f_read_counters = ipath_ht_read_counters;
+	dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+	dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+	dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+	dd->ipath_f_config_jint = ipath_ht_config_jint;
+	dd->ipath_f_ib_updown = ipath_ht_ib_updown;
 
 	/*
-	 * setup the register offsets, since they are different for each
-	 * chip
-	 */
-	dd->ipath_kregs = &ipath_ht_kregs;
-	dd->ipath_cregs = &ipath_ht_cregs;
-
-	/*
-	 * do very early init that is needed before ipath_f_bus is
-	 * called
+	 * initialize chip-specific variables
 	 */
 	ipath_init_ht_variables(dd);
 }
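
Both this file and ipath_iba6120.c below now compute counter register indices from a private, do-not-use mirror struct rather than the shared infinipath_counters layout. The trick is plain offsetof() arithmetic; a standalone sketch with a made-up layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Made-up mirror of a chip's counter block, one 64-bit word per counter. */
struct chip_counters {
	uint64_t LBIntCnt;	  /* index 0 */
	uint64_t LBFlowStallCnt;  /* index 1 */
	uint64_t Reserved1;	  /* index 2 */
	uint64_t TxUnsupVLErrCnt; /* index 3 */
};

/* Same idea as IPATH_CREG_OFFSET(): byte offset divided by register size. */
#define CREG_IDX(field) \
	(offsetof(struct chip_counters, field) / sizeof(uint64_t))

int main(void)
{
	printf("TxUnsupVLErrCnt lives at counter index %zu\n",
	       CREG_IDX(TxUnsupVLErrCnt));	/* prints 3 */
	return 0;
}

Because the mirror struct is private to each chip file, its layout can follow that chip's hardware exactly without constraining the generic counters structure.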

+ 350 - 89
drivers/infiniband/hw/ipath/ipath_iba6120.c

@@ -145,10 +145,57 @@ struct _infinipath_do_not_use_kernel_regs {
 	unsigned long long Reserved12;
 };
 
-#define IPATH_KREG_OFFSET(field) (offsetof(struct \
-    _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+struct _infinipath_do_not_use_counters {
+	__u64 LBIntCnt;
+	__u64 LBFlowStallCnt;
+	__u64 Reserved1;
+	__u64 TxUnsupVLErrCnt;
+	__u64 TxDataPktCnt;
+	__u64 TxFlowPktCnt;
+	__u64 TxDwordCnt;
+	__u64 TxLenErrCnt;
+	__u64 TxMaxMinLenErrCnt;
+	__u64 TxUnderrunCnt;
+	__u64 TxFlowStallCnt;
+	__u64 TxDroppedPktCnt;
+	__u64 RxDroppedPktCnt;
+	__u64 RxDataPktCnt;
+	__u64 RxFlowPktCnt;
+	__u64 RxDwordCnt;
+	__u64 RxLenErrCnt;
+	__u64 RxMaxMinLenErrCnt;
+	__u64 RxICRCErrCnt;
+	__u64 RxVCRCErrCnt;
+	__u64 RxFlowCtrlErrCnt;
+	__u64 RxBadFormatCnt;
+	__u64 RxLinkProblemCnt;
+	__u64 RxEBPCnt;
+	__u64 RxLPCRCErrCnt;
+	__u64 RxBufOvflCnt;
+	__u64 RxTIDFullErrCnt;
+	__u64 RxTIDValidErrCnt;
+	__u64 RxPKeyMismatchCnt;
+	__u64 RxP0HdrEgrOvflCnt;
+	__u64 RxP1HdrEgrOvflCnt;
+	__u64 RxP2HdrEgrOvflCnt;
+	__u64 RxP3HdrEgrOvflCnt;
+	__u64 RxP4HdrEgrOvflCnt;
+	__u64 RxP5HdrEgrOvflCnt;
+	__u64 RxP6HdrEgrOvflCnt;
+	__u64 RxP7HdrEgrOvflCnt;
+	__u64 RxP8HdrEgrOvflCnt;
+	__u64 Reserved6;
+	__u64 Reserved7;
+	__u64 IBStatusChangeCnt;
+	__u64 IBLinkErrRecoveryCnt;
+	__u64 IBLinkDownedCnt;
+	__u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+	struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
 #define IPATH_CREG_OFFSET(field) (offsetof( \
-    struct infinipath_counters, field) / sizeof(u64))
+	struct _infinipath_do_not_use_counters, field) / sizeof(u64))
 
 static const struct ipath_kregs ipath_pe_kregs = {
 	.kr_control = IPATH_KREG_OFFSET(Control),
@@ -282,6 +329,9 @@ static const struct ipath_cregs ipath_pe_cregs = {
 #define INFINIPATH_HWE_PCIE0PLLFAILED       0x0800000000000000ULL
 #define INFINIPATH_HWE_SERDESPLLFAILED      0x1000000000000000ULL
 
+#define IBA6120_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6120_IBCS_LINKSTATE_SHIFT 4
+
 /* kr_extstatus bits */
 #define INFINIPATH_EXTS_FREQSEL 0x2
 #define INFINIPATH_EXTS_SERDESSEL 0x4
@@ -296,6 +346,9 @@ static const struct ipath_cregs ipath_pe_cregs = {
 #define IPATH_GPIO_SCL (1ULL << \
 	(_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
 
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
 /* 6120 specific hardware errors... */
 static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
 	INFINIPATH_HWE_MSG(PCIEPOISONEDTLP, "PCIe Poisoned TLP"),
@@ -320,10 +373,28 @@ static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
 		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
 		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
 
-static int ipath_pe_txe_recover(struct ipath_devdata *);
 static void ipath_pe_put_tid_2(struct ipath_devdata *, u64 __iomem *,
 			       u32, unsigned long);
 
+/*
+ * On platforms using this chip, and not having ordered WC stores, we
+ * can get TXE parity errors due to speculative reads to the PIO buffers,
+ * and this, due to a chip bug can result in (many) false parity error
+ * reports.  So it's a debug print on those, and an info print on systems
+ * where the speculative reads don't occur.
+ */
+static void ipath_pe_txe_recover(struct ipath_devdata *dd)
+{
+	if (ipath_unordered_wc())
+		ipath_dbg("Recovering from TXE PIO parity error\n");
+	else {
+		++ipath_stats.sps_txeparity;
+		dev_info(&dd->pcidev->dev,
+			"Recovering from TXE PIO parity error\n");
+	}
+}
+
+
 /**
  * ipath_pe_handle_hwerrors - display hardware errors.
  * @dd: the infinipath device
@@ -403,35 +474,11 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
 		 * occur if a processor speculative read is done to the PIO
 		 * buffer while we are sending a packet, for example.
 		 */
-		if ((hwerrs & TXE_PIO_PARITY) && ipath_pe_txe_recover(dd))
+		if (hwerrs & TXE_PIO_PARITY) {
+			ipath_pe_txe_recover(dd);
 			hwerrs &= ~TXE_PIO_PARITY;
-		if (hwerrs) {
-			/*
-			 * if any set that we aren't ignoring only make the
-			 * complaint once, in case it's stuck or recurring,
-			 * and we get here multiple times
-			 * Force link down, so switch knows, and
-			 * LEDs are turned off
-			 */
-			if (dd->ipath_flags & IPATH_INITTED) {
-				ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
-				ipath_setup_pe_setextled(dd,
-					INFINIPATH_IBCS_L_STATE_DOWN,
-					INFINIPATH_IBCS_LT_STATE_DISABLED);
-				ipath_dev_err(dd, "Fatal Hardware Error (freeze "
-					      "mode), no longer usable, SN %.16s\n",
-						  dd->ipath_serial);
-				isfatal = 1;
-			}
-			/*
-			 * Mark as having had an error for driver, and also
-			 * for /sys and status word mapped to user programs.
-			 * This marks unit as not usable, until reset
-			 */
-			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-			*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-			dd->ipath_flags &= ~IPATH_INITTED;
-		} else {
+		}
+		if (!hwerrs) {
 			static u32 freeze_cnt;
 
 			freeze_cnt++;
@@ -485,7 +532,7 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
 
 	if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
 		/*
-		 * If it occurs, it is left masked since the eternal
+		 * If it occurs, it is left masked since the external
 		 * interface is unused
 		 */
 		dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
@@ -563,6 +610,14 @@ static int ipath_pe_boardname(struct ipath_devdata *dd, char *name,
 			dd->ipath_f_put_tid = ipath_pe_put_tid_2;
 	}
 
+
+	/*
+	 * set here, not in ipath_init_*_funcs because we have to do
+	 * it after we can read chip registers.
+	 */
+	dd->ipath_ureg_align =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
 	return ret;
 }
 
@@ -667,17 +722,8 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
 
 	val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
 	prev_val = val;
-	if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) &
-	     INFINIPATH_XGXS_MDIOADDR_MASK) != 3) {
-		val &=
-			~(INFINIPATH_XGXS_MDIOADDR_MASK <<
-			  INFINIPATH_XGXS_MDIOADDR_SHIFT);
-		/* MDIO address 3 */
-		val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT;
-	}
-	if (val & INFINIPATH_XGXS_RESET) {
+	if (val & INFINIPATH_XGXS_RESET)
 		val &= ~INFINIPATH_XGXS_RESET;
-	}
 	if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
 	     INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) {
 		/* need to compensate for Tx inversion in partner */
@@ -707,21 +753,6 @@ static int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
 		   (unsigned long long)
 		   ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
 
-	if (!ipath_waitfor_mdio_cmdready(dd)) {
-		ipath_write_kreg(
-			dd, dd->ipath_kregs->kr_mdio,
-			ipath_mdio_req(IPATH_MDIO_CMD_READ, 31,
-				       IPATH_MDIO_CTRL_XGXS_REG_8, 0));
-		if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio,
-					   IPATH_MDIO_DATAVALID, &val))
-			ipath_dbg("Never got MDIO data for XGXS "
-				  "status read\n");
-		else
-			ipath_cdbg(VERBOSE, "MDIO Read reg8, "
-				   "'bank' 31 %x\n", (u32) val);
-	} else
-		ipath_dbg("Never got MDIO cmdready for XGXS status read\n");
-
 	return ret;
 }
 
@@ -902,11 +933,26 @@ static int ipath_setup_pe_config(struct ipath_devdata *dd,
 	else
 		ipath_dev_err(dd, "Can't find PCI Express "
 			      "capability!\n");
+
+	dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+	dd->ipath_link_speed_supported = IPATH_IB_SDR;
+	dd->ipath_link_width_enabled = IB_WIDTH_4X;
+	dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+	/* these can't change for this chip, so set once */
+	dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+	dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
 	return 0;
 }
 
 static void ipath_init_pe_variables(struct ipath_devdata *dd)
 {
+	/*
+	 * setup the register offsets, since they are different for each
+	 * chip
+	 */
+	dd->ipath_kregs = &ipath_pe_kregs;
+	dd->ipath_cregs = &ipath_pe_cregs;
+
 	/*
 	 * bits for selecting i2c direction and values,
 	 * used for I2C serial flash
@@ -916,6 +962,43 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
 	dd->ipath_gpio_sda = IPATH_GPIO_SDA;
 	dd->ipath_gpio_scl = IPATH_GPIO_SCL;
 
+	/*
+	 * Fill in data for field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKTRAININGSTATE
+	 * and only the shift for LINKSTATE, as they are the only ones
+	 * that change.  Also precalculate the 3 link states of interest
+	 * and the combined mask.
+	 */
+	dd->ibcs_ls_shift = IBA6120_IBCS_LINKSTATE_SHIFT;
+	dd->ibcs_lts_mask = IBA6120_IBCS_LINKTRAININGSTATE_MASK;
+	dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+		dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+	dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+	dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+	dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+		INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+		(INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+	/*
+	 * Fill in data for ibcc field-values that change in newer chips.
+	 * We dynamically specify only the mask for LINKINITCMD
+	 * and only the shift for LINKCMD and MAXPKTLEN, as they are
+	 * the only ones that change.
+	 */
+	dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+	dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+	dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+	/* Fill in shifts for RcvCtrl. */
+	dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+	dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+	dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+	dd->ipath_r_portcfg_shift = 0; /* Not on IBA6120 */
+
 	/* variables for sanity checking interrupt and errors */
 	dd->ipath_hwe_bitsextant =
 		(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
@@ -963,6 +1046,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
 
 	dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
 	dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+	dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+	dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
 
 	/*
 	 * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
@@ -984,6 +1069,7 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
 		INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET;
 
 
+	dd->delay_mult = 2; /* SDR, 4X, can't change */
 }
 
 /* setup the MSI stuff again after a reset.  I'd like to just call
@@ -1289,6 +1375,9 @@ static int ipath_pe_early_init(struct ipath_devdata *dd)
 	 */
 	dd->ipath_rcvhdrentsize = 24;
 	dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+	dd->ipath_rhf_offset = 0;
+	dd->ipath_egrtidbase = (u64 __iomem *)
+		((char __iomem *) dd->ipath_kregbase + dd->ipath_rcvegrbase);
 
 	/*
 	 * To truly support a 4KB MTU (for usermode), we need to
@@ -1359,34 +1448,204 @@ static void ipath_pe_free_irq(struct ipath_devdata *dd)
 	dd->ipath_irq = 0;
 }
 
+
+static struct ipath_message_header *
+ipath_pe_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+	return (struct ipath_message_header *)
+		&rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_pe_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+	dd->ipath_portcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+	dd->ipath_p0_rcvegrcnt =
+		ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_pe_read_counters(struct ipath_devdata *dd,
+				   struct infinipath_counters *cntrs)
+{
+	cntrs->LBIntCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+	cntrs->LBFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+	cntrs->TxSDmaDescCnt = 0;
+	cntrs->TxUnsupVLErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+	cntrs->TxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+	cntrs->TxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+	cntrs->TxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+	cntrs->TxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+	cntrs->TxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+	cntrs->TxUnderrunCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+	cntrs->TxFlowStallCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+	cntrs->TxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+	cntrs->RxDroppedPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+	cntrs->RxDataPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+	cntrs->RxFlowPktCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+	cntrs->RxDwordCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+	cntrs->RxLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+	cntrs->RxMaxMinLenErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+	cntrs->RxICRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+	cntrs->RxVCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+	cntrs->RxFlowCtrlErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+	cntrs->RxBadFormatCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+	cntrs->RxLinkProblemCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+	cntrs->RxEBPCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+	cntrs->RxLPCRCErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+	cntrs->RxBufOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+	cntrs->RxTIDFullErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+	cntrs->RxTIDValidErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+	cntrs->RxPKeyMismatchCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+	cntrs->RxP0HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+	cntrs->RxP1HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+	cntrs->RxP2HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+	cntrs->RxP3HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+	cntrs->RxP4HdrEgrOvflCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+	cntrs->RxP5HdrEgrOvflCnt = 0;
+	cntrs->RxP6HdrEgrOvflCnt = 0;
+	cntrs->RxP7HdrEgrOvflCnt = 0;
+	cntrs->RxP8HdrEgrOvflCnt = 0;
+	cntrs->RxP9HdrEgrOvflCnt = 0;
+	cntrs->RxP10HdrEgrOvflCnt = 0;
+	cntrs->RxP11HdrEgrOvflCnt = 0;
+	cntrs->RxP12HdrEgrOvflCnt = 0;
+	cntrs->RxP13HdrEgrOvflCnt = 0;
+	cntrs->RxP14HdrEgrOvflCnt = 0;
+	cntrs->RxP15HdrEgrOvflCnt = 0;
+	cntrs->RxP16HdrEgrOvflCnt = 0;
+	cntrs->IBStatusChangeCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+	cntrs->IBLinkErrRecoveryCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+	cntrs->IBLinkDownedCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+	cntrs->IBSymbolErrCnt =
+		ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+	cntrs->RxVL15DroppedPktCnt = 0;
+	cntrs->RxOtherLocalPhyErrCnt = 0;
+	cntrs->PcieRetryBufDiagQwordCnt = 0;
+	cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+	cntrs->LocalLinkIntegrityErrCnt = dd->ipath_lli_errs;
+	cntrs->RxVlErrCnt = 0;
+	cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_pe_nointr_fallback(struct ipath_devdata *dd)
+{
+	return 0;
+}
+
+
 /*
- * On platforms using this chip, and not having ordered WC stores, we
- * can get TXE parity errors due to speculative reads to the PIO buffers,
- * and this, due to a chip bug can result in (many) false parity error
- * reports.  So it's a debug print on those, and an info print on systems
- * where the speculative reads don't occur.
- * Because we can get lots of false errors, we have no upper limit
- * on recovery attempts on those platforms.
+ * reset the XGXS (between serdes and IBC).  Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining.  To do this right, we reset IBC
+ * as well.
  */
-static int ipath_pe_txe_recover(struct ipath_devdata *dd)
+static void ipath_pe_xgxs_reset(struct ipath_devdata *dd)
 {
-	if (ipath_unordered_wc())
-		ipath_dbg("Recovering from TXE PIO parity error\n");
-	else {
-		int cnt = ++ipath_stats.sps_txeparity;
-		if (cnt >= IPATH_MAX_PARITY_ATTEMPTS)  {
-			if (cnt == IPATH_MAX_PARITY_ATTEMPTS)
-				ipath_dev_err(dd,
-					"Too many attempts to recover from "
-					"TXE parity, giving up\n");
-			return 0;
-		}
-		dev_info(&dd->pcidev->dev,
-			"Recovering from TXE PIO parity error\n");
+	u64 val, prev_val;
+
+	prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+	val = prev_val | INFINIPATH_XGXS_RESET;
+	prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+	ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+			 dd->ipath_control);
+}
+
+
+static int ipath_pe_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+	int ret;
+
+	switch (which) {
+	case IPATH_IB_CFG_LWID:
+		ret = dd->ipath_link_width_active;
+		break;
+	case IPATH_IB_CFG_SPD:
+		ret = dd->ipath_link_speed_active;
+		break;
+	case IPATH_IB_CFG_LWID_ENB:
+		ret = dd->ipath_link_width_enabled;
+		break;
+	case IPATH_IB_CFG_SPD_ENB:
+		ret = dd->ipath_link_speed_enabled;
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
 	}
-	return 1;
+	return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_pe_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+	int ret = 0;
+
+	if (which == IPATH_IB_CFG_LWID_ENB)
+		dd->ipath_link_width_enabled = val;
+	else if (which == IPATH_IB_CFG_SPD_ENB)
+		dd->ipath_link_speed_enabled = val;
+	else
+		ret = -ENOTSUPP;
+	return ret;
 }
 
+static void ipath_pe_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_pe_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+	ipath_setup_pe_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+		ipath_ib_linktrstate(dd, ibcs));
+	return 0;
+}
+
+
 /**
  * ipath_init_iba6120_funcs - set up the chip-specific function pointers
  * @dd: the infinipath device
@@ -1407,7 +1666,7 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_bringup_serdes = ipath_pe_bringup_serdes;
 	dd->ipath_f_clear_tids = ipath_pe_clear_tids;
 	/*
-	 * this may get changed after we read the chip revision,
+	 * _f_put_tid may get changed after we read the chip revision,
 	 * but we start with the safe version for all revs
 	 */
 	dd->ipath_f_put_tid = ipath_pe_put_tid;
@@ -1415,17 +1674,19 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *dd)
 	dd->ipath_f_setextled = ipath_setup_pe_setextled;
 	dd->ipath_f_get_base_info = ipath_pe_get_base_info;
 	dd->ipath_f_free_irq = ipath_pe_free_irq;
-
-	/* initialize chip-specific variables */
 	dd->ipath_f_tidtemplate = ipath_pe_tidtemplate;
+	dd->ipath_f_intr_fallback = ipath_pe_nointr_fallback;
+	dd->ipath_f_xgxs_reset = ipath_pe_xgxs_reset;
+	dd->ipath_f_get_msgheader = ipath_pe_get_msgheader;
+	dd->ipath_f_config_ports = ipath_pe_config_ports;
+	dd->ipath_f_read_counters = ipath_pe_read_counters;
+	dd->ipath_f_get_ib_cfg = ipath_pe_get_ib_cfg;
+	dd->ipath_f_set_ib_cfg = ipath_pe_set_ib_cfg;
+	dd->ipath_f_config_jint = ipath_pe_config_jint;
+	dd->ipath_f_ib_updown = ipath_pe_ib_updown;
 
-	/*
-	 * setup the register offsets, since they are different for each
-	 * chip
-	 */
-	dd->ipath_kregs = &ipath_pe_kregs;
-	dd->ipath_cregs = &ipath_pe_cregs;
 
+	/* initialize chip-specific variables */
 	ipath_init_pe_variables(dd);
 }
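
ipath_init_iba6120_funcs() (and its 6110 counterpart above) keep growing the set of per-chip hooks hung off ipath_devdata, so the generic driver never branches on chip type. The shape of that dispatch, reduced to a toy example with invented names, is roughly:

/*
 * Toy version of the per-chip hook table; names are invented, not the
 * driver's.  The real driver stores the pointers directly in ipath_devdata.
 */
struct chip;

struct chip_ops {
	int  (*get_ib_cfg)(struct chip *c, int which);
	int  (*set_ib_cfg)(struct chip *c, int which, unsigned int val);
	void (*xgxs_reset)(struct chip *c);
};

struct chip {
	const struct chip_ops *ops;
	int link_width_enabled;
};

/* Generic code stays chip-agnostic by calling through the ops pointers. */
static inline int chip_get_ib_cfg(struct chip *c, int which)
{
	return c->ops->get_ib_cfg(c, which);
}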
 

+ 27 - 40
drivers/infiniband/hw/ipath/ipath_init_chip.c

@@ -91,7 +91,7 @@ static int create_port0_egr(struct ipath_devdata *dd)
 	struct ipath_skbinfo *skbinfo;
 	int ret;
 
-	egrcnt = dd->ipath_rcvegrcnt;
+	egrcnt = dd->ipath_p0_rcvegrcnt;
 
 	skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
 	if (skbinfo == NULL) {
@@ -244,8 +244,7 @@ static int init_chip_first(struct ipath_devdata *dd,
 	 * cfgports.  We do still check and report a difference, if
 	 * not same (should be impossible).
 	 */
-	dd->ipath_portcnt =
-		ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+	dd->ipath_f_config_ports(dd, ipath_cfgports);
 	if (!ipath_cfgports)
 		dd->ipath_cfgports = dd->ipath_portcnt;
 	else if (ipath_cfgports <= dd->ipath_portcnt) {
@@ -272,22 +271,7 @@ static int init_chip_first(struct ipath_devdata *dd,
 		goto done;
 	}
 
-	dd->ipath_lastegrheads = kzalloc(sizeof(*dd->ipath_lastegrheads)
-					 * dd->ipath_cfgports,
-					 GFP_KERNEL);
-	dd->ipath_lastrcvhdrqtails =
-		kzalloc(sizeof(*dd->ipath_lastrcvhdrqtails)
-			* dd->ipath_cfgports, GFP_KERNEL);
-
-	if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) {
-		ipath_dev_err(dd, "Unable to allocate head arrays, "
-			      "failing\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
 	pd = create_portdata0(dd);
-
 	if (!pd) {
 		ipath_dev_err(dd, "Unable to allocate portdata for port "
 			      "0, failing\n");
@@ -345,10 +329,10 @@ static int init_chip_first(struct ipath_devdata *dd,
 		       dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
 
 	spin_lock_init(&dd->ipath_tid_lock);
-
+	spin_lock_init(&dd->ipath_sendctrl_lock);
 	spin_lock_init(&dd->ipath_gpio_lock);
 	spin_lock_init(&dd->ipath_eep_st_lock);
-	sema_init(&dd->ipath_eep_sem, 1);
+	mutex_init(&dd->ipath_eep_lock);
 
 done:
 	*pdp = pd;
@@ -372,9 +356,9 @@ static int init_chip_reset(struct ipath_devdata *dd,
 	*pdp = dd->ipath_pd[0];
 	/* ensure chip does no sends or receives while we re-initialize */
 	dd->ipath_control = dd->ipath_sendctrl = dd->ipath_rcvctrl = 0U;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, dd->ipath_rcvctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, dd->ipath_control);
 
 	rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
 	if (dd->ipath_portcnt != rtmp)
@@ -487,6 +471,7 @@ static void enable_chip(struct ipath_devdata *dd,
 			struct ipath_portdata *pd, int reinit)
 {
 	u32 val;
+	unsigned long flags;
 	int i;
 
 	if (!reinit)
@@ -495,19 +480,21 @@ static void enable_chip(struct ipath_devdata *dd,
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 			 dd->ipath_rcvctrl);
 
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
 	/* Enable PIO send, and update of PIOavail regs to memory. */
 	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
 		INFINIPATH_S_PIOBUFAVAILUPD;
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			 dd->ipath_sendctrl);
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
 	/*
 	 * enable port 0 receive, and receive interrupt.  other ports
 	 * done as user opens and inits them.
 	 */
-	dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD |
-		(1ULL << INFINIPATH_R_PORTENABLE_SHIFT) |
-		(1ULL << INFINIPATH_R_INTRAVAIL_SHIFT);
+	dd->ipath_rcvctrl = (1ULL << dd->ipath_r_tailupd_shift) |
+		(1ULL << dd->ipath_r_portenable_shift) |
+		(1ULL << dd->ipath_r_intravail_shift);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 			 dd->ipath_rcvctrl);
 
@@ -523,12 +510,11 @@ static void enable_chip(struct ipath_devdata *dd,
 	 */
 	val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
 	(void)ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
-	dd->ipath_port0head = ipath_read_ureg32(dd, ur_rcvhdrtail, 0);
 
 	/* Initialize so we interrupt on next packet received */
 	(void)ipath_write_ureg(dd, ur_rcvhdrhead,
 			       dd->ipath_rhdrhead_intr_off |
-			       dd->ipath_port0head, 0);
+			       dd->ipath_pd[0]->port_head, 0);
 
 	/*
 	 * by now pioavail updates to memory should have occurred, so
@@ -542,12 +528,8 @@ static void enable_chip(struct ipath_devdata *dd,
 		/*
 		 * Chip Errata bug 6641; even and odd qwords>3 are swapped.
 		 */
-		if (i > 3) {
-			if (i & 1)
-				val = dd->ipath_pioavailregs_dma[i - 1];
-			else
-				val = dd->ipath_pioavailregs_dma[i + 1];
-		}
+		if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+			val = dd->ipath_pioavailregs_dma[i ^ 1];
 		else
 			val = dd->ipath_pioavailregs_dma[i];
 		dd->ipath_pioavailshadow[i] = le64_to_cpu(val);
@@ -690,12 +672,13 @@ done:
  */
 int ipath_init_chip(struct ipath_devdata *dd, int reinit)
 {
-	int ret = 0, i;
+	int ret = 0;
 	u32 val32, kpiobufs;
 	u32 piobufs, uports;
 	u64 val;
 	struct ipath_portdata *pd = NULL; /* keep gcc4 happy */
 	gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+	unsigned long flags;
 
 	ret = init_housekeeping(dd, &pd, reinit);
 	if (ret)
@@ -746,7 +729,7 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit)
 		kpiobufs = ipath_kpiobufs;
 
 	if (kpiobufs + (uports * IPATH_MIN_USER_PORT_BUFCNT) > piobufs) {
-		i = (int) piobufs -
+		int i = (int) piobufs -
 			(int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
 		if (i < 0)
 			i = 0;
@@ -827,8 +810,12 @@ int ipath_init_chip(struct ipath_devdata *dd, int reinit)
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
 			 ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
-	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-			 INFINIPATH_S_PIOENABLE);
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE;
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
 	/*
 	 * before error clears, since we expect serdes pll errors during
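
The errata handling in enable_chip() above replaces the even/odd ?: chain with a plain XOR: for any index, (i & 1) ? i - 1 : i + 1 and i ^ 1 pick the same neighbour. A throwaway userspace check of that identity:

#include <assert.h>

/* The old form of the "Chip Errata bug 6641" swap for indices above 3. */
static unsigned int swap_old(unsigned int i)
{
	return (i & 1) ? i - 1 : i + 1;
}

int main(void)
{
	unsigned int i;

	for (i = 4; i < 64; i++)
		assert(swap_old(i) == (i ^ 1));	/* the new 'i ^ 1' form */
	return 0;
}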

+ 39 - 42
drivers/infiniband/hw/ipath/ipath_intr.c

@@ -683,7 +683,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 		for (i = 0; i < dd->ipath_cfgports; i++) {
 			struct ipath_portdata *pd = dd->ipath_pd[i];
 			if (i == 0) {
-				hd = dd->ipath_port0head;
+				hd = pd->port_head;
 				tl = (u32) le64_to_cpu(
 					*dd->ipath_hdrqtailptr);
 			} else if (pd && pd->port_cnt &&
@@ -693,7 +693,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 				 * except kernel
 				 */
 				tl = *(u64 *) pd->port_rcvhdrtail_kvaddr;
-				if (tl == dd->ipath_lastrcvhdrqtails[i])
+				if (tl == pd->port_lastrcvhdrqtail)
 					continue;
 				hd = ipath_read_ureg32(dd, ur_rcvhdrhead,
 						       i);
@@ -703,7 +703,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 			    (!hd && tl == dd->ipath_hdrqlast)) {
 				if (i == 0)
 					chkerrpkts = 1;
-				dd->ipath_lastrcvhdrqtails[i] = tl;
+				pd->port_lastrcvhdrqtail = tl;
 				pd->port_hdrqfull++;
 				/* flush hdrqfull so that poll() sees it */
 				wmb();
@@ -712,6 +712,8 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 		}
 	}
 	if (errs & INFINIPATH_E_RRCVEGRFULL) {
+		struct ipath_portdata *pd = dd->ipath_pd[0];
+
 		/*
 		 * since this is of less importance and not likely to
 		 * happen without also getting hdrfull, only count
@@ -719,7 +721,7 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 		 * vs user)
 		 */
 		ipath_stats.sps_etidfull++;
-		if (dd->ipath_port0head !=
+		if (pd->port_head !=
 		    (u32) le64_to_cpu(*dd->ipath_hdrqtailptr))
 			chkerrpkts = 1;
 	}
@@ -795,6 +797,7 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
 {
 	int i, im;
 	__le64 val;
+	unsigned long flags;
 
 	/* disable error interrupts, to avoid confusion */
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
@@ -813,11 +816,14 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
 			 dd->ipath_control);
 
 	/* ensure pio avail updates continue */
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 		 dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
 	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
-		 dd->ipath_sendctrl);
+			 dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
 	/*
 	 * We just enabled pioavailupdate, so dma copy is almost certainly
@@ -825,8 +831,8 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
 	 */
 	for (i = 0; i < dd->ipath_pioavregs; i++) {
 		/* deal with 6110 chip bug */
-		im = i > 3 ? ((i&1) ? i-1 : i+1) : i;
-		val = ipath_read_kreg64(dd, (0x1000/sizeof(u64))+im);
+		im = i > 3 ? i ^ 1 : i;
+		val = ipath_read_kreg64(dd, (0x1000 / sizeof(u64)) + im);
 		dd->ipath_pioavailregs_dma[i] = dd->ipath_pioavailshadow[i]
 			= le64_to_cpu(val);
 	}
@@ -849,7 +855,7 @@ void ipath_clear_freeze(struct ipath_devdata *dd)
 
 /* this is separate to allow for better optimization of ipath_intr() */
 
-static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp)
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
 {
 	/*
 	 * sometimes happen during driver init and unload, don't want
@@ -877,7 +883,7 @@ static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp)
 				dd->ipath_f_free_irq(dd);
 			}
 		}
-		if (ipath_read_kreg32(dd, dd->ipath_kregs->kr_intmask)) {
+		if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
 			ipath_dev_err(dd, "%u unexpected interrupts, "
 				      "disabling interrupts completely\n",
 				      *unexpectp);
@@ -892,7 +898,7 @@ static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp)
 			  "ignoring\n");
 }
 
-static void ipath_bad_regread(struct ipath_devdata *dd)
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
 {
 	static int allbits;
 
@@ -920,31 +926,9 @@ static void ipath_bad_regread(struct ipath_devdata *dd)
 	}
 }
 
-static void handle_port_pioavail(struct ipath_devdata *dd)
-{
-	u32 i;
-	/*
-	 * start from port 1, since for now port 0  is never using
-	 * wait_event for PIO
-	 */
-	for (i = 1; dd->ipath_portpiowait && i < dd->ipath_cfgports; i++) {
-		struct ipath_portdata *pd = dd->ipath_pd[i];
-
-		if (pd && pd->port_cnt &&
-		    dd->ipath_portpiowait & (1U << i)) {
-			clear_bit(i, &dd->ipath_portpiowait);
-			if (test_bit(IPATH_PORT_WAITING_PIO,
-				     &pd->port_flag)) {
-				clear_bit(IPATH_PORT_WAITING_PIO,
-					  &pd->port_flag);
-				wake_up_interruptible(&pd->port_wait);
-			}
-		}
-	}
-}
-
 static void handle_layer_pioavail(struct ipath_devdata *dd)
 {
+	unsigned long flags;
 	int ret;
 
 	ret = ipath_ib_piobufavail(dd->verbs_dev);
@@ -953,9 +937,12 @@ static void handle_layer_pioavail(struct ipath_devdata *dd)
 
 	return;
 set:
-	set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 			 dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 }
 
 /*
@@ -969,7 +956,15 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat)
 	int i;
 	int rcvdint = 0;
 
-	/* test_bit below needs this... */
+	/*
+	 * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+	 * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+	 * would both like timely updates of the bits so that
+	 * we don't pass them by unnecessarily.  the rmb()
+	 * here ensures that we see them promptly -- the
+	 * corresponding wmb()'s are in ipath_poll_urgent()
+	 * and ipath_poll_next()...
+	 */
 	rmb();
 	portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
 		 dd->ipath_i_rcvavail_mask)
@@ -980,7 +975,7 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat)
 		if (portr & (1 << i) && pd && pd->port_cnt) {
 			if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
 					       &pd->port_flag)) {
-				clear_bit(i + INFINIPATH_R_INTRAVAIL_SHIFT,
+				clear_bit(i + dd->ipath_r_intravail_shift,
 					  &dd->ipath_rcvctrl);
 				wake_up_interruptible(&pd->port_wait);
 				rcvdint = 1;
@@ -1039,7 +1034,7 @@ irqreturn_t ipath_intr(int irq, void *data)
 		goto bail;
 	}
 
-	istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus);
+	istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
 
 	if (unlikely(!istat)) {
 		ipath_stats.sps_nullintr++;
@@ -1180,7 +1175,7 @@ irqreturn_t ipath_intr(int irq, void *data)
 	 * for receive are at the bottom.
 	 */
 	if (chk0rcv) {
-		ipath_kreceive(dd);
+		ipath_kreceive(dd->ipath_pd[0]);
 		istat &= ~port0rbits;
 	}
 
@@ -1191,12 +1186,14 @@ irqreturn_t ipath_intr(int irq, void *data)
 		handle_urcv(dd, istat);
 
 	if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
-		clear_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+		unsigned long flags;
+
+		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+		dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 				 dd->ipath_sendctrl);
-
-		if (dd->ipath_portpiowait)
-			handle_port_pioavail(dd);
+		ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+		spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 
 		handle_layer_pioavail(dd);
 	}
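
Every kr_sendctrl update in this file now follows the same sequence: take the new ipath_sendctrl_lock, modify the shadow copy, write the register, read back kr_scratch to flush the posted write, then unlock. A sketch of that sequence as a helper (the helper itself is illustrative; the register calls and field names follow the driver):

#include <linux/spinlock.h>

/* Illustrative wrapper around the locked sendctrl update used above. */
static void sendctrl_set_bits(struct ipath_devdata *dd, unsigned long bits)
{
	unsigned long flags;

	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
	dd->ipath_sendctrl |= bits;		/* keep the shadow in sync */
	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
	/* flush the posted MMIO write before dropping the lock */
	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
}

Serializing the shadow and the register under one lock is what lets the interrupt path and process context both clear and set INFINIPATH_S_PIOINTBUFAVAIL without losing updates.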

+ 149 - 52
drivers/infiniband/hw/ipath/ipath_kernel.h

@@ -41,6 +41,7 @@
 #include <linux/interrupt.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/mutex.h>
 #include <asm/io.h>
 #include <rdma/ib_verbs.h>
 
@@ -140,6 +141,11 @@ struct ipath_portdata {
 	u32 port_pionowait;
 	/* total number of rcvhdrqfull errors */
 	u32 port_hdrqfull;
+	/*
+	 * Used to suppress multiple instances of same
+	 * port staying stuck at same point.
+	 */
+	u32 port_lastrcvhdrqtail;
 	/* saved total number of rcvhdrqfull errors for poll edge trigger */
 	u32 port_hdrqfull_poll;
 	/* total number of polled urgent packets */
@@ -148,6 +154,7 @@ struct ipath_portdata {
 	u32 port_urgent_poll;
 	/* pid of process using this port */
 	pid_t port_pid;
+	pid_t port_subpid[INFINIPATH_MAX_SUBPORT];
 	/* same size as task_struct .comm[] */
 	char port_comm[16];
 	/* pkeys set by this use of this port */
@@ -166,6 +173,8 @@ struct ipath_portdata {
 	u32 active_slaves;
 	/* Type of packets or conditions we want to poll for */
 	u16 poll_type;
+	/* port rcvhdrq head offset */
+	u32 port_head;
 };
 
 struct sk_buff;
@@ -182,6 +191,22 @@ struct ipath_skbinfo {
 	dma_addr_t phys;
 };
 
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get link latency */
+
+
 struct ipath_devdata {
 	struct list_head ipath_list;
 
@@ -222,6 +247,8 @@ struct ipath_devdata {
 	struct _ipath_layer ipath_layer;
 	/* setup intr */
 	int (*ipath_f_intrsetup)(struct ipath_devdata *);
+	/* fallback to alternate interrupt type if possible */
+	int (*ipath_f_intr_fallback)(struct ipath_devdata *);
 	/* setup on-chip bus config */
 	int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
 	/* hard reset chip */
@@ -244,6 +271,18 @@ struct ipath_devdata {
 	int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
 	/* free irq */
 	void (*ipath_f_free_irq)(struct ipath_devdata *);
+	struct ipath_message_header *(*ipath_f_get_msgheader)
+					(struct ipath_devdata *, __le32 *);
+	void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+	int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+	int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+	void (*ipath_f_config_jint)(struct ipath_devdata *, u16, u16);
+	void (*ipath_f_read_counters)(struct ipath_devdata *,
+					struct infinipath_counters *);
+	void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+	/* per chip actions needed for IB Link up/down changes */
+	int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
 	struct ipath_ibdev *verbs_dev;
 	struct timer_list verbs_timer;
 	/* total dwords sent (summed from counter) */
@@ -313,21 +352,11 @@ struct ipath_devdata {
 	 * supports, less gives more pio bufs/port, etc.
 	 */
 	u32 ipath_cfgports;
-	/* port0 rcvhdrq head offset */
-	u32 ipath_port0head;
 	/* count of port 0 hdrqfull errors */
 	u32 ipath_p0_hdrqfull;
+	/* port 0 number of receive eager buffers */
+	u32 ipath_p0_rcvegrcnt;
 
-	/*
-	 * (*cfgports) used to suppress multiple instances of same
-	 * port staying stuck at same point
-	 */
-	u32 *ipath_lastrcvhdrqtails;
-	/*
-	 * (*cfgports) used to suppress multiple instances of same
-	 * port staying stuck at same point
-	 */
-	u32 *ipath_lastegrheads;
 	/*
 	 * index of last piobuffer we used.  Speeds up searching, by
 	 * starting at this point.  Doesn't matter if multiple cpu's use and
@@ -367,14 +396,15 @@ struct ipath_devdata {
 	unsigned long ipath_wc_len;
 	/* ref count for each pkey */
 	atomic_t ipath_pkeyrefs[4];
-	/* shadow copy of all exptids physaddr; used only by funcsim */
-	u64 *ipath_tidsimshadow;
 	/* shadow copy of struct page *'s for exp tid pages */
 	struct page **ipath_pageshadow;
 	/* shadow copy of dma handles for exp tid pages */
 	dma_addr_t *ipath_physshadow;
-	/* lock to workaround chip bug 9437 */
+	u64 __iomem *ipath_egrtidbase;
+	/* lock to workaround chip bug 9437 and others */
+	spinlock_t ipath_kernel_tid_lock;
 	spinlock_t ipath_tid_lock;
+	spinlock_t ipath_sendctrl_lock;
 
 	/*
 	 * IPATH_STATUS_*,
@@ -395,6 +425,8 @@ struct ipath_devdata {
 	void *ipath_dummy_hdrq;	/* used after port close */
 	dma_addr_t ipath_dummy_hdrq_phys;
 
+	unsigned long ipath_ureg_align; /* user register alignment */
+
 	/*
 	 * Shadow copies of registers; size indicates read access size.
 	 * Most of them are readonly, but some are write-only register,
@@ -456,8 +488,6 @@ struct ipath_devdata {
 	unsigned long ipath_rcvctrl;
 	/* shadow kr_sendctrl */
 	unsigned long ipath_sendctrl;
-	/* ports waiting for PIOavail intr */
-	unsigned long ipath_portpiowait;
 	unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */
 
 	/* value we put in kr_rcvhdrcnt */
@@ -550,12 +580,26 @@ struct ipath_devdata {
 	u8 ipath_minrev;
 	/* board rev, from ipath_revision */
 	u8 ipath_boardrev;
+
+	u8 ipath_r_portenable_shift;
+	u8 ipath_r_intravail_shift;
+	u8 ipath_r_tailupd_shift;
+	u8 ipath_r_portcfg_shift;
+
 	/* unit # of this chip, if present */
 	int ipath_unit;
 	/* saved for restore after reset */
 	u8 ipath_pci_cacheline;
 	/* LID mask control */
 	u8 ipath_lmc;
+	/* link width supported */
+	u8 ipath_link_width_supported;
+	/* link speed supported */
+	u8 ipath_link_speed_supported;
+	u8 ipath_link_width_enabled;
+	u8 ipath_link_speed_enabled;
+	u8 ipath_link_width_active;
+	u8 ipath_link_speed_active;
 	/* Rx Polarity inversion (compensate for ~tx on partner) */
 	u8 ipath_rx_pol_inv;
 
@@ -590,6 +634,8 @@ struct ipath_devdata {
 	 */
 	u32 ipath_i_rcvavail_mask;
 	u32 ipath_i_rcvurg_mask;
+	u16 ipath_i_rcvurg_shift;
+	u16 ipath_i_rcvavail_shift;
 
 	/*
 	 * Register bits for selecting i2c direction and values, used for
@@ -603,6 +649,29 @@ struct ipath_devdata {
 	/* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
 	spinlock_t ipath_gpio_lock;
 
+	/*
+	 * IB link and linktraining states and masks that vary per chip in
+	 * some way.  Set at init, to avoid recomputing them on each IB
+	 * status change interrupt.
+	 */
+	u8 ibcs_ls_shift;
+	u8 ibcs_lts_mask;
+	u32 ibcs_mask;
+	u32 ib_init;
+	u32 ib_arm;
+	u32 ib_active;
+
+	u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+	/*
+	 * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontrol
+	 * reg. Changes for IBA7220
+	 */
+	u8 ibcc_lic_mask; /* LinkInitCmd */
+	u8 ibcc_lc_shift; /* LinkCmd */
+	u8 ibcc_mpl_shift; /* Maxpktlen */
+
+	u8 delay_mult;
+
 	/* used to override LED behavior */
 	u8 ipath_led_override;  /* Substituted for normal value, if non-zero */
 	u16 ipath_led_override_timeoff; /* delta to next timer event */
@@ -616,7 +685,7 @@ struct ipath_devdata {
 	/* control access to actual counters, timer */
 	spinlock_t ipath_eep_st_lock;
 	/* control high-level access to EEPROM */
-	struct semaphore ipath_eep_sem;
+	struct mutex ipath_eep_lock;
 	/* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
 	uint64_t ipath_traffic_wds;
 	/* active time is kept in seconds, but logged in hours */
@@ -630,6 +699,10 @@ struct ipath_devdata {
 	 * each of the counters to increment.
 	 */
 	struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+	/* interrupt mitigation reload register info */
+	u16 ipath_jint_idle_ticks;	/* idle clock ticks */
+	u16 ipath_jint_max_packets;	/* max packets across all ports */
 };
 
 /* Private data for file operations */
@@ -690,7 +763,7 @@ void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
 
 int ipath_parse_ushort(const char *str, unsigned short *valp);
 
-void ipath_kreceive(struct ipath_devdata *);
+void ipath_kreceive(struct ipath_portdata *);
 int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
 int ipath_reset_device(int);
 void ipath_get_faststats(unsigned long);
@@ -698,6 +771,8 @@ int ipath_set_linkstate(struct ipath_devdata *, u8);
 int ipath_set_mtu(struct ipath_devdata *, u16);
 int ipath_set_lid(struct ipath_devdata *, u32, u8);
 int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
 
 /* for use in system calls, where we want to know device type, etc. */
 #define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
@@ -744,9 +819,15 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 		 * are 64bit */
 #define IPATH_32BITCOUNTERS 0x20000
 		/* can miss port0 rx interrupts */
+		/* Interrupt register is 64 bits */
+#define IPATH_INTREG_64     0x40000
 #define IPATH_DISABLED      0x80000 /* administratively disabled */
 		/* Use GPIO interrupts for new counters */
 #define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS  0x200000
+		/* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT      0x1000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
 
 /* Bits in GPIO for the added interrupts */
 #define IPATH_GPIO_PORT0_BIT 2
@@ -758,8 +839,6 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 /* portdata flag bit offsets */
 		/* waiting for a packet to arrive */
 #define IPATH_PORT_WAITING_RCV   2
-		/* waiting for a PIO buffer to be available */
-#define IPATH_PORT_WAITING_PIO   3
 		/* master has not finished initializing */
 #define IPATH_PORT_MASTER_UNINIT 4
 		/* waiting for an urgent packet to arrive */
@@ -767,8 +846,6 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 
 /* free up any allocated data at closes */
 void ipath_free_data(struct ipath_portdata *dd);
-int ipath_waitfor_mdio_cmdready(struct ipath_devdata *);
-int ipath_waitfor_complete(struct ipath_devdata *, ipath_kreg, u64, u64 *);
 u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32 *);
 void ipath_init_iba6120_funcs(struct ipath_devdata *);
 void ipath_init_iba6110_funcs(struct ipath_devdata *);
@@ -792,33 +869,6 @@ void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
  */
 #define IPATH_DFLT_RCVHDRSIZE 9
 
-#define IPATH_MDIO_CMD_WRITE   1
-#define IPATH_MDIO_CMD_READ    2
-#define IPATH_MDIO_CLD_DIV     25	/* to get 2.5 Mhz mdio clock */
-#define IPATH_MDIO_CMDVALID    0x40000000	/* bit 30 */
-#define IPATH_MDIO_DATAVALID   0x80000000	/* bit 31 */
-#define IPATH_MDIO_CTRL_STD    0x0
-
-static inline u64 ipath_mdio_req(int cmd, int dev, int reg, int data)
-{
-	return (((u64) IPATH_MDIO_CLD_DIV) << 32) |
-		(cmd << 26) |
-		(dev << 21) |
-		(reg << 16) |
-		(data & 0xFFFF);
-}
-
-		/* signal and fifo status, in bank 31 */
-#define IPATH_MDIO_CTRL_XGXS_REG_8  0x8
-		/* controls loopback, redundancy */
-#define IPATH_MDIO_CTRL_8355_REG_1  0x10
-		/* premph, encdec, etc. */
-#define IPATH_MDIO_CTRL_8355_REG_2  0x11
-		/* Kchars, etc. */
-#define IPATH_MDIO_CTRL_8355_REG_6  0x15
-#define IPATH_MDIO_CTRL_8355_REG_9  0x18
-#define IPATH_MDIO_CTRL_8355_REG_10 0x1D
-
 int ipath_get_user_pages(unsigned long, size_t, struct page **);
 void ipath_release_user_pages(struct page **, size_t);
 void ipath_release_user_pages_on_close(struct page **, size_t);
@@ -863,7 +913,7 @@ static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
 	return readl(regno + (u64 __iomem *)
 		     (dd->ipath_uregbase +
 		      (char __iomem *)dd->ipath_kregbase +
-		      dd->ipath_palign * port));
+		      dd->ipath_ureg_align * port));
 }
 
 /**
@@ -880,7 +930,7 @@ static inline void ipath_write_ureg(const struct ipath_devdata *dd,
 {
 	u64 __iomem *ubase = (u64 __iomem *)
 		(dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
-		 dd->ipath_palign * port);
+		 dd->ipath_ureg_align * port);
 	if (dd->ipath_kregbase)
 		writeq(value, &ubase[regno]);
 }
@@ -930,6 +980,53 @@ static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
 		      (char __iomem *)dd->ipath_kregbase));
 }
 
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+				    ipath_creg regno, u64 value)
+{
+	if (dd->ipath_kregbase)
+		writeq(value, regno + (u64 __iomem *)
+		       (dd->ipath_cregbase +
+			(char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	*((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
+	return (u32) le64_to_cpu(*((volatile __le64 *)
+				pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+	return (dd->ipath_flags & IPATH_INTREG_64) ?
+		ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * from contents of IBCStatus (or a saved copy), return linkstate
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere, anyway (and should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+		INFINIPATH_IBCS_LINKSTATE_MASK;
+	if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+		state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+	return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+	return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+		dd->ibcs_lts_mask;
+}
+
 /*
  * sysfs interface.
  */
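
Several of the new ipath_devdata fields above (ibcs_ls_shift, ibcs_lts_mask, the ibcc_* shifts, ipath_ureg_align) exist so common driver code can decode register layouts that differ per chip instead of hard-coding one chip's constants; the new ipath_ib_linkstate()/ipath_ib_linktrstate() inlines are the first consumers. A minimal user-space sketch of that shift/mask pattern follows — the shift values and state codes are invented for illustration and are not real InfiniPath chip constants.

/*
 * Sketch only: per-chip link-state decode via a shift stored at init,
 * with ACTIVE_DEFER folded into ACTIVE as the driver comment describes.
 */
#include <stdint.h>
#include <stdio.h>

#define LINKSTATE_MASK    0x7u
#define L_STATE_ACTIVE    0x3u
#define L_STATE_ACT_DEFER 0x4u

struct chip_layout {
	uint8_t ibcs_ls_shift;	/* filled in once by chip-specific init code */
};

static uint32_t ib_linkstate(const struct chip_layout *c, uint64_t ibcs)
{
	uint32_t state = (uint32_t)(ibcs >> c->ibcs_ls_shift) & LINKSTATE_MASK;

	/* report ACTIVE_DEFER as ACTIVE, as in the driver */
	return state == L_STATE_ACT_DEFER ? L_STATE_ACTIVE : state;
}

int main(void)
{
	struct chip_layout chip_a = { .ibcs_ls_shift = 4 };
	struct chip_layout chip_b = { .ibcs_ls_shift = 5 };

	printf("ACT_DEFER decodes to ACTIVE on both layouts: %u and %u\n",
	       ib_linkstate(&chip_a, (uint64_t)L_STATE_ACT_DEFER << 4),
	       ib_linkstate(&chip_b, (uint64_t)L_STATE_ACT_DEFER << 5));
	return 0;
}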

+ 2 - 3
drivers/infiniband/hw/ipath/ipath_keys.c

@@ -128,9 +128,8 @@ int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
 	int ret;
 
 	/*
-	 * We use LKEY == zero to mean a physical kmalloc() address.
-	 * This is a bit of a hack since we rely on dma_map_single()
-	 * being reversible by calling bus_to_virt().
+	 * We use LKEY == zero for kernel virtual addresses
+	 * (see ipath_get_dma_mr and ipath_dma.c).
 	 */
 	if (sge->lkey == 0) {
 		struct ipath_pd *pd = to_ipd(qp->ibqp.pd);

+ 77 - 46
drivers/infiniband/hw/ipath/ipath_mad.c

@@ -934,6 +934,7 @@ static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp,
 	struct ib_pma_portsamplescontrol *p =
 		(struct ib_pma_portsamplescontrol *)pmp->data;
 	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
 	unsigned long flags;
 	u8 port_select = p->port_select;
 
@@ -955,7 +956,10 @@ static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp,
 	p->counter_width = 4;	/* 32 bit counters */
 	p->counter_mask0_9 = COUNTER_MASK0_9;
 	spin_lock_irqsave(&dev->pending_lock, flags);
-	p->sample_status = dev->pma_sample_status;
+	if (crp->cr_psstat)
+		p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		p->sample_status = dev->pma_sample_status;
 	p->sample_start = cpu_to_be32(dev->pma_sample_start);
 	p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
 	p->tag = cpu_to_be16(dev->pma_tag);
@@ -975,8 +979,9 @@ static int recv_pma_set_portsamplescontrol(struct ib_perf *pmp,
 	struct ib_pma_portsamplescontrol *p =
 		(struct ib_pma_portsamplescontrol *)pmp->data;
 	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
 	unsigned long flags;
-	u32 start;
+	u8 status;
 	int ret;
 
 	if (pmp->attr_mod != 0 ||
@@ -986,59 +991,67 @@ static int recv_pma_set_portsamplescontrol(struct ib_perf *pmp,
 		goto bail;
 	}
 
-	start = be32_to_cpu(p->sample_start);
-	if (start != 0) {
-		spin_lock_irqsave(&dev->pending_lock, flags);
-		if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_DONE) {
-			dev->pma_sample_status =
-				IB_PMA_SAMPLE_STATUS_STARTED;
-			dev->pma_sample_start = start;
-			dev->pma_sample_interval =
-				be32_to_cpu(p->sample_interval);
-			dev->pma_tag = be16_to_cpu(p->tag);
-			if (p->counter_select[0])
-				dev->pma_counter_select[0] =
-					p->counter_select[0];
-			if (p->counter_select[1])
-				dev->pma_counter_select[1] =
-					p->counter_select[1];
-			if (p->counter_select[2])
-				dev->pma_counter_select[2] =
-					p->counter_select[2];
-			if (p->counter_select[3])
-				dev->pma_counter_select[3] =
-					p->counter_select[3];
-			if (p->counter_select[4])
-				dev->pma_counter_select[4] =
-					p->counter_select[4];
-		}
-		spin_unlock_irqrestore(&dev->pending_lock, flags);
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+		dev->pma_sample_start = be32_to_cpu(p->sample_start);
+		dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+		dev->pma_tag = be16_to_cpu(p->tag);
+		dev->pma_counter_select[0] = p->counter_select[0];
+		dev->pma_counter_select[1] = p->counter_select[1];
+		dev->pma_counter_select[2] = p->counter_select[2];
+		dev->pma_counter_select[3] = p->counter_select[3];
+		dev->pma_counter_select[4] = p->counter_select[4];
+		if (crp->cr_psstat) {
+			ipath_write_creg(dev->dd, crp->cr_psinterval,
+					 dev->pma_sample_interval);
+			ipath_write_creg(dev->dd, crp->cr_psstart,
+					 dev->pma_sample_start);
+		} else
+			dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
 	}
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
+
 	ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
 
 bail:
 	return ret;
 }
 
-static u64 get_counter(struct ipath_ibdev *dev, __be16 sel)
+static u64 get_counter(struct ipath_ibdev *dev,
+		       struct ipath_cregs const *crp,
+		       __be16 sel)
 {
 	u64 ret;
 
 	switch (sel) {
 	case IB_PMA_PORT_XMIT_DATA:
-		ret = dev->ipath_sword;
+		ret = (crp->cr_psxmitdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+			dev->ipath_sword;
 		break;
 	case IB_PMA_PORT_RCV_DATA:
-		ret = dev->ipath_rword;
+		ret = (crp->cr_psrcvdatacount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+			dev->ipath_rword;
 		break;
 	case IB_PMA_PORT_XMIT_PKTS:
-		ret = dev->ipath_spkts;
+		ret = (crp->cr_psxmitpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+			dev->ipath_spkts;
 		break;
 	case IB_PMA_PORT_RCV_PKTS:
-		ret = dev->ipath_rpkts;
+		ret = (crp->cr_psrcvpktscount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+			dev->ipath_rpkts;
 		break;
 	case IB_PMA_PORT_XMIT_WAIT:
-		ret = dev->ipath_xmit_wait;
+		ret = (crp->cr_psxmitwaitcount) ?
+			ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+			dev->ipath_xmit_wait;
 		break;
 	default:
 		ret = 0;
@@ -1053,14 +1066,21 @@ static int recv_pma_get_portsamplesresult(struct ib_perf *pmp,
 	struct ib_pma_portsamplesresult *p =
 		(struct ib_pma_portsamplesresult *)pmp->data;
 	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
 	int i;
 
 	memset(pmp->data, 0, sizeof(pmp->data));
 	p->tag = cpu_to_be16(dev->pma_tag);
-	p->sample_status = cpu_to_be16(dev->pma_sample_status);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
 	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-		p->counter[i] = cpu_to_be32(
-			get_counter(dev, dev->pma_counter_select[i]));
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be32(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
 
 	return reply((struct ib_smp *) pmp);
 }
@@ -1071,16 +1091,23 @@ static int recv_pma_get_portsamplesresult_ext(struct ib_perf *pmp,
 	struct ib_pma_portsamplesresult_ext *p =
 		(struct ib_pma_portsamplesresult_ext *)pmp->data;
 	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+	u8 status;
 	int i;
 
 	memset(pmp->data, 0, sizeof(pmp->data));
 	p->tag = cpu_to_be16(dev->pma_tag);
-	p->sample_status = cpu_to_be16(dev->pma_sample_status);
+	if (crp->cr_psstat)
+		status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+	else
+		status = dev->pma_sample_status;
+	p->sample_status = cpu_to_be16(status);
 	/* 64 bits */
 	p->extended_width = __constant_cpu_to_be32(0x80000000);
 	for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
-		p->counter[i] = cpu_to_be64(
-			get_counter(dev, dev->pma_counter_select[i]));
+		p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+		    cpu_to_be64(
+			get_counter(dev, crp, dev->pma_counter_select[i]));
 
 	return reply((struct ib_smp *) pmp);
 }
@@ -1113,6 +1140,8 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp,
 		dev->z_local_link_integrity_errors;
 	cntrs.excessive_buffer_overrun_errors -=
 		dev->z_excessive_buffer_overrun_errors;
+	cntrs.vl15_dropped -= dev->z_vl15_dropped;
+	cntrs.vl15_dropped += dev->n_vl15_dropped;
 
 	memset(pmp->data, 0, sizeof(pmp->data));
 
@@ -1156,10 +1185,10 @@ static int recv_pma_get_portcounters(struct ib_perf *pmp,
 		cntrs.excessive_buffer_overrun_errors = 0xFUL;
 	p->lli_ebor_errors = (cntrs.local_link_integrity_errors << 4) |
 		cntrs.excessive_buffer_overrun_errors;
-	if (dev->n_vl15_dropped > 0xFFFFUL)
+	if (cntrs.vl15_dropped > 0xFFFFUL)
 		p->vl15_dropped = __constant_cpu_to_be16(0xFFFF);
 	else
-		p->vl15_dropped = cpu_to_be16((u16)dev->n_vl15_dropped);
+		p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
 	if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
 		p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF);
 	else
@@ -1262,8 +1291,10 @@ static int recv_pma_set_portcounters(struct ib_perf *pmp,
 		dev->z_excessive_buffer_overrun_errors =
 			cntrs.excessive_buffer_overrun_errors;
 
-	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED)
+	if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
 		dev->n_vl15_dropped = 0;
+		dev->z_vl15_dropped = cntrs.vl15_dropped;
+	}
 
 	if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
 		dev->z_port_xmit_data = cntrs.port_xmit_data;
@@ -1434,7 +1465,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
 		 * before checking for other consumers.
 		 * Just tell the caller to process it normally.
 		 */
-		ret = IB_MAD_RESULT_FAILURE;
+		ret = IB_MAD_RESULT_SUCCESS;
 		goto bail;
 	default:
 		smp->status |= IB_SMP_UNSUP_METHOD;
@@ -1516,7 +1547,7 @@ static int process_perf(struct ib_device *ibdev, u8 port_num,
 		 * before checking for other consumers.
 		 * Just tell the caller to process it normally.
 		 */
-		ret = IB_MAD_RESULT_FAILURE;
+		ret = IB_MAD_RESULT_SUCCESS;
 		goto bail;
 	default:
 		pmp->status |= IB_SMP_UNSUP_METHOD;
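
get_counter() now prefers the chip's dedicated port-sample counters and only falls back to the software shadow when the corresponding cr_ps* register number is zero, i.e. the chip has no such counter. A small user-space sketch of that fallback shape — the register numbers and the read function are placeholders, not real ipath registers or MMIO accessors:

#include <stdint.h>
#include <stdio.h>

struct cregs {
	uint32_t cr_psxmitdatacount;	/* 0 means "chip has no such counter" */
};

struct ibdev_shadow {
	uint64_t sword;			/* software-maintained transmit count */
};

static uint32_t read_creg32(uint32_t regno)
{
	return regno * 1000;		/* stand-in for a real register read */
}

static uint64_t get_xmit_data(const struct cregs *crp,
			      const struct ibdev_shadow *dev)
{
	/* hardware counter if present, otherwise the software shadow */
	return crp->cr_psxmitdatacount ?
		read_creg32(crp->cr_psxmitdatacount) : dev->sword;
}

int main(void)
{
	struct cregs with_hw = { .cr_psxmitdatacount = 42 };
	struct cregs without_hw = { 0 };
	struct ibdev_shadow dev = { .sword = 7 };

	printf("hw counter: %llu, software fallback: %llu\n",
	       (unsigned long long)get_xmit_data(&with_hw, &dev),
	       (unsigned long long)get_xmit_data(&without_hw, &dev));
	return 0;
}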

+ 2 - 4
drivers/infiniband/hw/ipath/ipath_qp.c

@@ -387,8 +387,8 @@ int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
 	struct ib_wc wc;
 	int ret = 0;
 
-	ipath_dbg("QP%d/%d in error state\n",
-		  qp->ibqp.qp_num, qp->remote_qpn);
+	ipath_dbg("QP%d/%d in error state (%d)\n",
+		  qp->ibqp.qp_num, qp->remote_qpn, err);
 
 	spin_lock(&dev->pending_lock);
 	/* XXX What if its already removed by the timeout code? */
@@ -855,8 +855,6 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 	 * See ipath_mmap() for details.
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
-		int err;
-
 		if (!qp->r_rq.wq) {
 			__u64 offset = 0;
 

+ 7 - 11
drivers/infiniband/hw/ipath/ipath_rc.c

@@ -647,6 +647,7 @@ static void send_rc_ack(struct ipath_qp *qp)
 
 queue_ack:
 	spin_lock_irqsave(&qp->s_lock, flags);
+	dev->n_rc_qacks++;
 	qp->s_flags |= IPATH_S_ACK_PENDING;
 	qp->s_nak_state = qp->r_nak_state;
 	qp->s_ack_psn = qp->r_ack_psn;
@@ -798,11 +799,13 @@ bail:
 
 static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
 {
-	if (qp->s_wait_credit) {
-		qp->s_wait_credit = 0;
-		tasklet_hi_schedule(&qp->s_task);
+	if (qp->s_last_psn != psn) {
+		qp->s_last_psn = psn;
+		if (qp->s_wait_credit) {
+			qp->s_wait_credit = 0;
+			tasklet_hi_schedule(&qp->s_task);
+		}
 	}
-	qp->s_last_psn = psn;
 }
 
 /**
@@ -1653,13 +1656,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	case OP(SEND_FIRST):
 		if (!ipath_get_rwqe(qp, 0)) {
 		rnr_nak:
-			/*
-			 * A RNR NAK will ACK earlier sends and RDMA writes.
-			 * Don't queue the NAK if a RDMA read or atomic
-			 * is pending though.
-			 */
-			if (qp->r_nak_state)
-				goto done;
 			qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
 			qp->r_ack_psn = qp->r_psn;
 			goto send_ack;

+ 15 - 18
drivers/infiniband/hw/ipath/ipath_registers.h

@@ -82,8 +82,7 @@
 
 /* kr_rcvctrl bits */
 #define INFINIPATH_R_PORTENABLE_SHIFT 0
-#define INFINIPATH_R_INTRAVAIL_SHIFT 16
-#define INFINIPATH_R_TAILUPD   0x80000000
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
 
 /* kr_intstatus, kr_intclear, kr_intmask bits */
 #define INFINIPATH_I_RCVURG_SHIFT 0
@@ -272,20 +271,6 @@
 #define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
 #define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
 
-/* kr_mdio bits */
-#define INFINIPATH_MDIO_CLKDIV_MASK 0x7FULL
-#define INFINIPATH_MDIO_CLKDIV_SHIFT 32
-#define INFINIPATH_MDIO_COMMAND_MASK 0x7ULL
-#define INFINIPATH_MDIO_COMMAND_SHIFT 26
-#define INFINIPATH_MDIO_DEVADDR_MASK 0x1FULL
-#define INFINIPATH_MDIO_DEVADDR_SHIFT 21
-#define INFINIPATH_MDIO_REGADDR_MASK 0x1FULL
-#define INFINIPATH_MDIO_REGADDR_SHIFT 16
-#define INFINIPATH_MDIO_DATA_MASK 0xFFFFULL
-#define INFINIPATH_MDIO_DATA_SHIFT 0
-#define INFINIPATH_MDIO_CMDVALID    0x0000000040000000ULL
-#define INFINIPATH_MDIO_RDDATAVALID 0x0000000080000000ULL
-
 /* kr_partitionkey bits */
 #define INFINIPATH_PKEY_SIZE 16
 #define INFINIPATH_PKEY_MASK 0xFFFF
@@ -303,8 +288,6 @@
 
 /* kr_xgxsconfig bits */
 #define INFINIPATH_XGXS_RESET          0x7ULL
-#define INFINIPATH_XGXS_MDIOADDR_MASK  0xfULL
-#define INFINIPATH_XGXS_MDIOADDR_SHIFT 4
 #define INFINIPATH_XGXS_RX_POL_SHIFT 19
 #define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
 
@@ -470,6 +453,20 @@ struct ipath_cregs {
 	ipath_creg cr_unsupvlcnt;
 	ipath_creg cr_wordrcvcnt;
 	ipath_creg cr_wordsendcnt;
+	ipath_creg cr_vl15droppedpktcnt;
+	ipath_creg cr_rxotherlocalphyerrcnt;
+	ipath_creg cr_excessbufferovflcnt;
+	ipath_creg cr_locallinkintegrityerrcnt;
+	ipath_creg cr_rxvlerrcnt;
+	ipath_creg cr_rxdlidfltrcnt;
+	ipath_creg cr_psstat;
+	ipath_creg cr_psstart;
+	ipath_creg cr_psinterval;
+	ipath_creg cr_psrcvdatacount;
+	ipath_creg cr_psrcvpktscount;
+	ipath_creg cr_psxmitdatacount;
+	ipath_creg cr_psxmitpktscount;
+	ipath_creg cr_psxmitwaitcount;
 };
 
 #endif				/* _IPATH_REGISTERS_H */

+ 11 - 2
drivers/infiniband/hw/ipath/ipath_ruc.c

@@ -98,11 +98,15 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
 		while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
 			qp->s_rnr_timeout -= nqp->s_rnr_timeout;
 			l = l->next;
-			if (l->next == &dev->rnrwait)
+			if (l->next == &dev->rnrwait) {
+				nqp = NULL;
 				break;
+			}
 			nqp = list_entry(l->next, struct ipath_qp,
 					 timerwait);
 		}
+		if (nqp)
+			nqp->s_rnr_timeout -= qp->s_rnr_timeout;
 		list_add(&qp->timerwait, l);
 	}
 	spin_unlock_irqrestore(&dev->pending_lock, flags);
@@ -479,9 +483,14 @@ done:
 
 static void want_buffer(struct ipath_devdata *dd)
 {
-	set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+	unsigned long flags;
+
+	spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+	dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 			 dd->ipath_sendctrl);
+	ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+	spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
 }
 
 /**
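
The ipath_insert_rnr_queue() hunk above fixes insertion into a delta-encoded timer list: each queued QP stores its RNR timeout relative to the QP in front of it, so when a new entry lands mid-list the entry that now follows it must have its delta reduced by the amount just inserted (and the walk must notice when it falls off the end of the list). A self-contained sketch of that invariant on a plain array, with made-up timeout values:

#include <stdio.h>

int main(void)
{
	/* existing deltas encode absolute timeouts 3, 7, 12 */
	unsigned delta[4] = { 3, 4, 5 };
	unsigned new_timeout = 6;	/* absolute; belongs between 3 and 7 */
	unsigned i, j, n = 3;

	/* walk the list, converting the new timeout into a delta */
	for (i = 0; i < n && new_timeout >= delta[i]; i++)
		new_timeout -= delta[i];

	/* make room and insert the remaining delta */
	for (j = n; j > i; j--)
		delta[j] = delta[j - 1];
	delta[i] = new_timeout;

	/* the successor (if any) shrinks by what was just inserted */
	if (i + 1 <= n)
		delta[i + 1] -= new_timeout;
	n++;

	for (i = 0; i < n; i++)
		printf("%u ", delta[i]);	/* prints: 3 3 1 5 */
	printf("\n");
	return 0;
}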

+ 2 - 2
drivers/infiniband/hw/ipath/ipath_srq.c

@@ -94,8 +94,8 @@ bail:
 /**
  * ipath_create_srq - create a shared receive queue
  * @ibpd: the protection domain of the SRQ to create
- * @attr: the attributes of the SRQ
- * @udata: not used by the InfiniPath verbs driver
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
  */
 struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
 				struct ib_srq_init_attr *srq_init_attr,

+ 12 - 12
drivers/infiniband/hw/ipath/ipath_stats.c

@@ -133,15 +133,16 @@ bail:
 static void ipath_qcheck(struct ipath_devdata *dd)
 {
 	static u64 last_tot_hdrqfull;
+	struct ipath_portdata *pd = dd->ipath_pd[0];
 	size_t blen = 0;
 	char buf[128];
 
 	*buf = 0;
-	if (dd->ipath_pd[0]->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+	if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
 		blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
-				dd->ipath_pd[0]->port_hdrqfull -
+				pd->port_hdrqfull -
 				dd->ipath_p0_hdrqfull);
-		dd->ipath_p0_hdrqfull = dd->ipath_pd[0]->port_hdrqfull;
+		dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
 	}
 	if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
 		blen += snprintf(buf + blen, sizeof buf - blen,
@@ -173,7 +174,7 @@ static void ipath_qcheck(struct ipath_devdata *dd)
 	if (blen)
 		ipath_dbg("%s\n", buf);
 
-	if (dd->ipath_port0head != (u32)
+	if (pd->port_head != (u32)
 	    le64_to_cpu(*dd->ipath_hdrqtailptr)) {
 		if (dd->ipath_lastport0rcv_cnt ==
 		    ipath_stats.sps_port0pkts) {
@@ -181,7 +182,7 @@ static void ipath_qcheck(struct ipath_devdata *dd)
 				   "port0 hd=%llx tl=%x; port0pkts %llx\n",
 				   (unsigned long long)
 				   le64_to_cpu(*dd->ipath_hdrqtailptr),
-				   dd->ipath_port0head,
+				   pd->port_head,
 				   (unsigned long long)
 				   ipath_stats.sps_port0pkts);
 		}
@@ -237,7 +238,7 @@ static void ipath_chk_errormask(struct ipath_devdata *dd)
 void ipath_get_faststats(unsigned long opaque)
 {
 	struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
-	u32 val;
+	int i;
 	static unsigned cnt;
 	unsigned long flags;
 	u64 traffic_wds;
@@ -321,12 +322,11 @@ void ipath_get_faststats(unsigned long opaque)
 
 	/* limit qfull messages to ~one per minute per port */
 	if ((++cnt & 0x10)) {
-		for (val = dd->ipath_cfgports - 1; ((int)val) >= 0;
-		     val--) {
-			if (dd->ipath_lastegrheads[val] != -1)
-				dd->ipath_lastegrheads[val] = -1;
-			if (dd->ipath_lastrcvhdrqtails[val] != -1)
-				dd->ipath_lastrcvhdrqtails[val] = -1;
+		for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+			struct ipath_portdata *pd = dd->ipath_pd[i];
+
+			if (pd && pd->port_lastrcvhdrqtail != -1)
+				pd->port_lastrcvhdrqtail = -1;
 		}
 	}
 

+ 364 - 0
drivers/infiniband/hw/ipath/ipath_sysfs.c

@@ -363,6 +363,60 @@ static ssize_t show_unit(struct device *dev,
 	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
 }
 
+static ssize_t show_jint_max_packets(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf,
+				      size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_max_packets.\n");
+	else
+		dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+	return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf,
+				     size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 v = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &v);
+	if (ret < 0)
+		ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+	else
+		dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+	return ret;
+}
+
 #define DEVICE_COUNTER(name, attr) \
 	static ssize_t show_counter_##name(struct device *dev, \
 					   struct device_attribute *attr, \
@@ -670,6 +724,257 @@ static ssize_t show_logged_errs(struct device *dev,
 	return count;
 }
 
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. Or of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 3)
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+		goto bail;
+	}
+
+	/*
+	 * Set the "intentional" heartbeat enable per either of
+	 * "Enable" and "Auto", as these are normally set together.
+	 * This bit is consulted when leaving loopback mode,
+	 * because entering loopback mode overrides it and automatically
+	 * disables heartbeat.
+	 */
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+	if (r < 0)
+		ret = r;
+	else if (val == IPATH_IB_HRTBT_OFF)
+		dd->ipath_flags |= IPATH_NO_HRTBT;
+	else
+		dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+	return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > 3))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Width (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+		ret = -EINVAL;
+	if (ret < 0) {
+		ipath_dev_err(dd,
+			"attempt to set invalid Link Speed (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret < 0 || val > 1)
+		goto invalid;
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+	if (r < 0) {
+		ret = r;
+		goto bail;
+	}
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid Rx Polarity (enable)\n");
+bail:
+	return ret;
+}
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+			 struct device_attribute *attr,
+			 char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret;
+
+	ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+	if (ret >= 0)
+		ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+	return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf,
+			  size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	int ret, r;
+	u16 val;
+
+	ret = ipath_parse_ushort(buf, &val);
+	if (ret >= 0 && val > 1) {
+		ret = -EINVAL;
+		ipath_dev_err(dd,
+			"attempt to set invalid Lane reversal (enable)\n");
+		goto bail;
+	}
+
+	r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+	if (r < 0)
+		ret = r;
+
+bail:
+	return ret;
+}
+
 static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
 static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
 
@@ -706,6 +1011,10 @@ static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
 static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
 static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
 static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+		   show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+		   show_jint_idle_ticks, store_jint_idle_ticks);
 
 static struct attribute *dev_attributes[] = {
 	&dev_attr_guid.attr,
@@ -732,6 +1041,34 @@ static struct attribute_group dev_attr_group = {
 	.attrs = dev_attributes
 };
 
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+		   store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+		   store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+		   store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+		   store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+		   store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+	&dev_attr_hrtbt_enable.attr,
+	&dev_attr_link_width_enable.attr,
+	&dev_attr_link_width.attr,
+	&dev_attr_link_speed_enable.attr,
+	&dev_attr_link_speed.attr,
+	&dev_attr_rx_pol_inv_enable.attr,
+	&dev_attr_rx_lane_rev_enable.attr,
+	NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+	.attrs = dev_ibcfg_attributes
+};
+
 /**
  * ipath_expose_reset - create a device reset file
  * @dev: the device structure
@@ -770,6 +1107,26 @@ int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
 	if (ret)
 		goto bail_attrs;
 
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+		if (ret)
+			goto bail_counter;
+		ret = device_create_file(dev, &dev_attr_jint_max_packets);
+		if (ret)
+			goto bail_idle;
+
+		ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+		if (ret)
+			goto bail_max;
+	}
+
+	return 0;
+
+bail_max:
+	device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+	device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
 	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
 bail_attrs:
 	sysfs_remove_group(&dev->kobj, &dev_attr_group);
@@ -780,6 +1137,13 @@ bail:
 void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
 {
 	sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+	if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+		sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+		device_remove_file(dev, &dev_attr_jint_idle_ticks);
+		device_remove_file(dev, &dev_attr_jint_max_packets);
+	}
+
 	sysfs_remove_group(&dev->kobj, &dev_attr_group);
 
 	device_remove_file(dev, &dev_attr_reset);
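
The error path added to ipath_device_create_group() follows the usual staged-unwind convention: each later failure jumps to a label that removes only what was already created, in reverse order, then falls through the earlier labels. A tiny user-space sketch of that shape — the names are placeholders, not real sysfs calls:

#include <stdio.h>

static int create(const char *name, int fail)
{
	if (fail) {
		printf("create %s: failed\n", name);
		return -1;
	}
	printf("create %s\n", name);
	return 0;
}

static void removef(const char *name)
{
	printf("remove %s\n", name);
}

int main(void)
{
	int ret;

	if ((ret = create("jint_idle_ticks", 0)))
		goto bail;
	if ((ret = create("jint_max_packets", 0)))
		goto bail_idle;
	if ((ret = create("ibcfg group", 1)))	/* force a failure */
		goto bail_max;
	return 0;

bail_max:
	removef("jint_max_packets");
bail_idle:
	removef("jint_idle_ticks");
bail:
	return ret;
}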

+ 1 - 2
drivers/infiniband/hw/ipath/ipath_ud.c

@@ -301,8 +301,6 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 
 	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
 	qp->s_hdrwords = 7;
-	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
-		qp->s_hdrwords++;
 	qp->s_cur_size = wqe->length;
 	qp->s_cur_sge = &qp->s_sge;
 	qp->s_wqe = wqe;
@@ -327,6 +325,7 @@ int ipath_make_ud_req(struct ipath_qp *qp)
 		ohdr = &qp->s_hdr.u.oth;
 	}
 	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		qp->s_hdrwords++;
 		ohdr->u.ud.imm_data = wqe->wr.imm_data;
 		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
 	} else
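
The hunk above moves the extra header word next to the code that actually writes the immediate, so it is only counted when an immediate is present. The arithmetic behind the "qp->s_hdrwords = 7" comment, as a trivial worked example:

#include <stdio.h>

int main(void)
{
	/* LRH (8 bytes) + BTH (12) + DETH (8) = 28 bytes = 7 32-bit words */
	int lrh = 8, bth = 12, deth = 8, imm = 4;
	int base_words = (lrh + bth + deth) / 4;

	printf("UD header: %d words, %d with immediate data\n",
	       base_words, base_words + imm / 4);
	return 0;
}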

+ 37 - 18
drivers/infiniband/hw/ipath/ipath_verbs.c

@@ -943,7 +943,7 @@ bail:
  * ipath_verbs_send - send a packet
  * @qp: the QP to send on
  * @hdr: the packet header
- * @hdrwords: the number of words in the header
+ * @hdrwords: the number of 32-bit words in the header
  * @ss: the SGE to send
  * @len: the length of the packet in bytes
  */
@@ -955,7 +955,10 @@ int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
 	int ret;
 	u32 dwords = (len + 3) >> 2;
 
-	/* +1 is for the qword padding of pbc */
+	/*
+	 * Calculate the send buffer trigger address.
+	 * The +1 counts for the pbc control dword following the pbc length.
+	 */
 	plen = hdrwords + dwords + 1;
 
 	/* Drop non-VL15 packets if we are not in the active state */
@@ -1130,20 +1133,34 @@ static int ipath_query_device(struct ib_device *ibdev,
 	return 0;
 }
 
-const u8 ipath_cvt_physportstate[16] = {
-	[INFINIPATH_IBCS_LT_STATE_DISABLED] = 3,
-	[INFINIPATH_IBCS_LT_STATE_LINKUP] = 5,
-	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2,
-	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2,
-	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1,
-	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1,
-	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4,
-	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4,
-	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4,
-	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6,
-	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6,
+const u8 ipath_cvt_physportstate[32] = {
+	[INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+	[INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+	[INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+	[INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+		IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+		IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+	[0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+	[0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
 };
 
 u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
@@ -1168,8 +1185,9 @@ static int ipath_query_port(struct ib_device *ibdev,
 	ibcstat = dd->ipath_lastibcstat;
 	props->state = ((ibcstat >> 4) & 0x3) + 1;
 	/* See phys_state_show() */
-	props->phys_state = ipath_cvt_physportstate[
-		dd->ipath_lastibcstat & 0xf];
+	props->phys_state = /* MEA: assumes shift == 0 */
+		ipath_cvt_physportstate[dd->ipath_lastibcstat &
+		dd->ibcs_lts_mask];
 	props->port_cap_flags = dev->port_cap_flags;
 	props->gid_tbl_len = 1;
 	props->max_msg_sz = 0x80000000;
@@ -1641,6 +1659,7 @@ int ipath_register_ib_device(struct ipath_devdata *dd)
 		cntrs.local_link_integrity_errors;
 	idev->z_excessive_buffer_overrun_errors =
 		cntrs.excessive_buffer_overrun_errors;
+	idev->z_vl15_dropped = cntrs.vl15_dropped;
 
 	/*
 	 * The system image GUID is supposed to be the same for all

+ 12 - 0
drivers/infiniband/hw/ipath/ipath_verbs.h

@@ -554,6 +554,7 @@ struct ipath_ibdev {
 	u32 z_pkey_violations;			/* starting count for PMA */
 	u32 z_local_link_integrity_errors;	/* starting count for PMA */
 	u32 z_excessive_buffer_overrun_errors;	/* starting count for PMA */
+	u32 z_vl15_dropped;			/* starting count for PMA */
 	u32 n_rc_resends;
 	u32 n_rc_acks;
 	u32 n_rc_qacks;
@@ -598,6 +599,7 @@ struct ipath_verbs_counters {
 	u64 port_rcv_packets;
 	u32 local_link_integrity_errors;
 	u32 excessive_buffer_overrun_errors;
+	u32 vl15_dropped;
 };
 
 static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
@@ -830,7 +832,17 @@ unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
 
 extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
 
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
 extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
 
 extern const int ib_ipath_state_ops[];
 

+ 5 - 4
drivers/infiniband/hw/mlx4/cq.c

@@ -313,6 +313,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
 	struct mlx4_ib_srq *srq;
 	int is_send;
 	int is_error;
+	u32 g_mlpath_rqpn;
 	u16 wqe_ctr;
 
 	cqe = next_cqe_sw(cq);
@@ -426,10 +427,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
 
 		wc->slid	   = be16_to_cpu(cqe->rlid);
 		wc->sl		   = cqe->sl >> 4;
-		wc->src_qp	   = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff;
-		wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f;
-		wc->wc_flags      |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ?
-			IB_WC_GRH : 0;
+		g_mlpath_rqpn	   = be32_to_cpu(cqe->g_mlpath_rqpn);
+		wc->src_qp	   = g_mlpath_rqpn & 0xffffff;
+		wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+		wc->wc_flags	  |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
 		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
 	}
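
The mlx4 poll-CQ change byte-swaps cqe->g_mlpath_rqpn once and then pulls three fields out of the host-order word: the low 24 bits are the source QP number, bits 24-30 the DLID path bits, and bit 31 the GRH-present flag. A user-space sketch of that unpacking on a made-up sample value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* sample only: GRH bit set, path bits 0x5a, source QP 0xabc */
	uint32_t g_mlpath_rqpn = 0x80000000u | (0x5au << 24) | 0x000abcu;

	uint32_t src_qp    = g_mlpath_rqpn & 0xffffff;
	uint32_t path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
	int grh            = !!(g_mlpath_rqpn & 0x80000000u);

	printf("src_qp=0x%x path_bits=0x%x grh=%d\n", src_qp, path_bits, grh);
	return 0;
}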
 

+ 6 - 7
drivers/infiniband/hw/mthca/mthca_dev.h

@@ -60,13 +60,12 @@
 enum {
 	MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
 	MTHCA_FLAG_SRQ        = 1 << 2,
-	MTHCA_FLAG_MSI        = 1 << 3,
-	MTHCA_FLAG_MSI_X      = 1 << 4,
-	MTHCA_FLAG_NO_LAM     = 1 << 5,
-	MTHCA_FLAG_FMR        = 1 << 6,
-	MTHCA_FLAG_MEMFREE    = 1 << 7,
-	MTHCA_FLAG_PCIE       = 1 << 8,
-	MTHCA_FLAG_SINAI_OPT  = 1 << 9
+	MTHCA_FLAG_MSI_X      = 1 << 3,
+	MTHCA_FLAG_NO_LAM     = 1 << 4,
+	MTHCA_FLAG_FMR        = 1 << 5,
+	MTHCA_FLAG_MEMFREE    = 1 << 6,
+	MTHCA_FLAG_PCIE       = 1 << 7,
+	MTHCA_FLAG_SINAI_OPT  = 1 << 8
 };
 
 enum {

+ 2 - 4
drivers/infiniband/hw/mthca/mthca_eq.c

@@ -827,8 +827,7 @@ int mthca_init_eq_table(struct mthca_dev *dev)
 	if (err)
 		goto err_out_free;
 
-	if (dev->mthca_flags & MTHCA_FLAG_MSI ||
-	    dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+	if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
 		dev->eq_table.clr_mask = 0;
 	} else {
 		dev->eq_table.clr_mask =
@@ -839,8 +838,7 @@ int mthca_init_eq_table(struct mthca_dev *dev)
 
 	dev->eq_table.arm_mask = 0;
 
-	intr = (dev->mthca_flags & MTHCA_FLAG_MSI) ?
-		128 : dev->eq_table.inta_pin;
+	intr = dev->eq_table.inta_pin;
 
 	err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE,
 			      (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,

+ 6 - 34
drivers/infiniband/hw/mthca/mthca_main.c

@@ -65,14 +65,9 @@ static int msi_x = 1;
 module_param(msi_x, int, 0444);
 MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
 
-static int msi = 0;
-module_param(msi, int, 0444);
-MODULE_PARM_DESC(msi, "attempt to use MSI if nonzero (deprecated, use MSI-X instead)");
-
 #else /* CONFIG_PCI_MSI */
 
 #define msi_x (0)
-#define msi   (0)
 
 #endif /* CONFIG_PCI_MSI */
 
@@ -816,13 +811,11 @@ static int mthca_setup_hca(struct mthca_dev *dev)
 
 	err = mthca_NOP(dev, &status);
 	if (err || status) {
-		if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X)) {
+		if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
 			mthca_warn(dev, "NOP command failed to generate interrupt "
 				   "(IRQ %d).\n",
-				   dev->mthca_flags & MTHCA_FLAG_MSI_X ?
-				   dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector :
-				   dev->pdev->irq);
-			mthca_warn(dev, "Trying again with MSI/MSI-X disabled.\n");
+				   dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector);
+			mthca_warn(dev, "Trying again with MSI-X disabled.\n");
 		} else {
 			mthca_err(dev, "NOP command failed to generate interrupt "
 				  "(IRQ %d), aborting.\n",
@@ -1005,7 +998,7 @@ static struct {
 			   .flags     = 0 },
 	[ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200),
 			   .flags     = MTHCA_FLAG_PCIE },
-	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 2, 0),
+	[ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0),
 			   .flags     = MTHCA_FLAG_MEMFREE |
 					MTHCA_FLAG_PCIE },
 	[SINAI]        = { .latest_fw = MTHCA_FW_VER(1, 2, 0),
@@ -1128,29 +1121,12 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
 
 	if (msi_x && !mthca_enable_msi_x(mdev))
 		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
-	else if (msi) {
-		static int warned;
-
-		if (!warned) {
-			printk(KERN_WARNING PFX "WARNING: MSI support will be "
-			       "removed from the ib_mthca driver in January 2008.\n");
-			printk(KERN_WARNING "    If you are using MSI and cannot "
-			       "switch to MSI-X, please tell "
-			       "<general@lists.openfabrics.org>.\n");
-			++warned;
-		}
-
-		if (!pci_enable_msi(pdev))
-			mdev->mthca_flags |= MTHCA_FLAG_MSI;
-	}
 
 	err = mthca_setup_hca(mdev);
-	if (err == -EBUSY && (mdev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))) {
+	if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
 		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
 			pci_disable_msix(pdev);
-		if (mdev->mthca_flags & MTHCA_FLAG_MSI)
-			pci_disable_msi(pdev);
-		mdev->mthca_flags &= ~(MTHCA_FLAG_MSI_X | MTHCA_FLAG_MSI);
+		mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
 
 		err = mthca_setup_hca(mdev);
 	}
@@ -1192,8 +1168,6 @@ err_cleanup:
 err_close:
 	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
 		pci_disable_msix(pdev);
-	if (mdev->mthca_flags & MTHCA_FLAG_MSI)
-		pci_disable_msi(pdev);
 
 	mthca_close_hca(mdev);
 
@@ -1246,8 +1220,6 @@ static void __mthca_remove_one(struct pci_dev *pdev)
 
 		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
 			pci_disable_msix(pdev);
-		if (mdev->mthca_flags & MTHCA_FLAG_MSI)
-			pci_disable_msi(pdev);
 
 		ib_dealloc_device(&mdev->ib_dev);
 		mthca_release_regions(pdev, mdev->mthca_flags &

+ 108 - 76
drivers/infiniband/ulp/ipoib/ipoib.h

@@ -56,42 +56,43 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE         = 2048,
-	IPOIB_BUF_SIZE 		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
+	IPOIB_PACKET_SIZE	  = 2048,
+	IPOIB_BUF_SIZE		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
 
-	IPOIB_ENCAP_LEN 	  = 4,
+	IPOIB_ENCAP_LEN		  = 4,
 
-	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
-	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
-	IPOIB_CM_HEAD_SIZE 	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
-	IPOIB_CM_RX_SG            = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
-	IPOIB_RX_RING_SIZE 	  = 128,
-	IPOIB_TX_RING_SIZE 	  = 64,
+	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
+	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
+	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
+	IPOIB_CM_RX_SG		  = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
+	IPOIB_RX_RING_SIZE	  = 128,
+	IPOIB_TX_RING_SIZE	  = 64,
 	IPOIB_MAX_QUEUE_SIZE	  = 8192,
 	IPOIB_MIN_QUEUE_SIZE	  = 2,
+	IPOIB_CM_MAX_CONN_QP	  = 4096,
 
-	IPOIB_NUM_WC 		  = 4,
+	IPOIB_NUM_WC		  = 4,
 
 	IPOIB_MAX_PATH_REC_QUEUE  = 3,
-	IPOIB_MAX_MCAST_QUEUE     = 3,
-
-	IPOIB_FLAG_OPER_UP 	  = 0,
-	IPOIB_FLAG_INITIALIZED    = 1,
-	IPOIB_FLAG_ADMIN_UP 	  = 2,
-	IPOIB_PKEY_ASSIGNED 	  = 3,
-	IPOIB_PKEY_STOP 	  = 4,
-	IPOIB_FLAG_SUBINTERFACE   = 5,
-	IPOIB_MCAST_RUN 	  = 6,
-	IPOIB_STOP_REAPER         = 7,
-	IPOIB_MCAST_STARTED       = 8,
-	IPOIB_FLAG_ADMIN_CM 	  = 9,
+	IPOIB_MAX_MCAST_QUEUE	  = 3,
+
+	IPOIB_FLAG_OPER_UP	  = 0,
+	IPOIB_FLAG_INITIALIZED	  = 1,
+	IPOIB_FLAG_ADMIN_UP	  = 2,
+	IPOIB_PKEY_ASSIGNED	  = 3,
+	IPOIB_PKEY_STOP		  = 4,
+	IPOIB_FLAG_SUBINTERFACE	  = 5,
+	IPOIB_MCAST_RUN		  = 6,
+	IPOIB_STOP_REAPER	  = 7,
+	IPOIB_MCAST_STARTED	  = 8,
+	IPOIB_FLAG_ADMIN_CM	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
-	IPOIB_MCAST_FLAG_FOUND 	  = 0,	/* used in set_multicast_list */
+	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
-	IPOIB_MCAST_FLAG_BUSY 	  = 2,	/* joining or already joined */
+	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
 };
 
@@ -117,7 +118,7 @@ struct ipoib_pseudoheader {
 struct ipoib_mcast {
 	struct ib_sa_mcmember_rec mcmember;
 	struct ib_sa_multicast	 *mc;
-	struct ipoib_ah          *ah;
+	struct ipoib_ah		 *ah;
 
 	struct rb_node    rb_node;
 	struct list_head  list;
@@ -186,27 +187,29 @@ enum ipoib_cm_state {
 };
 
 struct ipoib_cm_rx {
-	struct ib_cm_id     *id;
-	struct ib_qp        *qp;
-	struct list_head     list;
-	struct net_device   *dev;
-	unsigned long        jiffies;
-	enum ipoib_cm_state  state;
+	struct ib_cm_id	       *id;
+	struct ib_qp	       *qp;
+	struct ipoib_cm_rx_buf *rx_ring;
+	struct list_head	list;
+	struct net_device      *dev;
+	unsigned long		jiffies;
+	enum ipoib_cm_state	state;
+	int			recv_count;
 };
 
 struct ipoib_cm_tx {
-	struct ib_cm_id     *id;
-	struct ib_qp        *qp;
+	struct ib_cm_id	    *id;
+	struct ib_qp	    *qp;
 	struct list_head     list;
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
 	struct ipoib_tx_buf *tx_ring;
-	unsigned             tx_head;
-	unsigned             tx_tail;
-	unsigned long        flags;
-	u32                  mtu;
-	struct ib_wc         ibwc[IPOIB_NUM_WC];
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	unsigned long	     flags;
+	u32		     mtu;
+	struct ib_wc	     ibwc[IPOIB_NUM_WC];
 };
 
 struct ipoib_cm_rx_buf {
@@ -215,25 +218,28 @@ struct ipoib_cm_rx_buf {
 };
 
 struct ipoib_cm_dev_priv {
-	struct ib_srq  	       *srq;
+	struct ib_srq	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
-	struct ib_cm_id        *id;
-	struct list_head        passive_ids;   /* state: LIVE */
-	struct list_head        rx_error_list; /* state: ERROR */
-	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
-	struct list_head        rx_drain_list; /* state: FLUSH, drain started */
-	struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
+	struct ib_cm_id	       *id;
+	struct list_head	passive_ids;   /* state: LIVE */
+	struct list_head	rx_error_list; /* state: ERROR */
+	struct list_head	rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head	rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head	rx_reap_list;  /* state: FLUSH, drain done */
 	struct work_struct      start_task;
 	struct work_struct      reap_task;
 	struct work_struct      skb_task;
 	struct work_struct      rx_reap_task;
 	struct delayed_work     stale_task;
 	struct sk_buff_head     skb_queue;
-	struct list_head        start_list;
-	struct list_head        reap_list;
-	struct ib_wc            ibwc[IPOIB_NUM_WC];
-	struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
+	struct list_head	start_list;
+	struct list_head	reap_list;
+	struct ib_wc		ibwc[IPOIB_NUM_WC];
+	struct ib_sge		rx_sge[IPOIB_CM_RX_SG];
 	struct ib_recv_wr       rx_wr;
+	int			nonsrq_conn_qp;
+	int			max_cm_mtu;
+	int			num_frags;
 };
 
 /*
@@ -269,30 +275,30 @@ struct ipoib_dev_priv {
 	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
-	u8            	  port;
-	u16           	  pkey;
-	u16               pkey_index;
-	struct ib_pd  	 *pd;
-	struct ib_mr  	 *mr;
-	struct ib_cq  	 *cq;
-	struct ib_qp  	 *qp;
-	u32           	  qkey;
+	u8		  port;
+	u16		  pkey;
+	u16		  pkey_index;
+	struct ib_pd	 *pd;
+	struct ib_mr	 *mr;
+	struct ib_cq	 *cq;
+	struct ib_qp	 *qp;
+	u32		  qkey;
 
 	union ib_gid local_gid;
-	u16          local_lid;
+	u16	     local_lid;
 
 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
 
 	struct ipoib_rx_buf *rx_ring;
 
-	spinlock_t           tx_lock;
+	spinlock_t	     tx_lock;
 	struct ipoib_tx_buf *tx_ring;
-	unsigned             tx_head;
-	unsigned             tx_tail;
-	struct ib_sge        tx_sge;
+	unsigned	     tx_head;
+	unsigned	     tx_tail;
+	struct ib_sge	     tx_sge;
 	struct ib_send_wr    tx_wr;
-	unsigned             tx_outstanding;
+	unsigned	     tx_outstanding;
 
 	struct ib_wc ibwc[IPOIB_NUM_WC];
 
@@ -317,10 +323,10 @@ struct ipoib_dev_priv {
 
 struct ipoib_ah {
 	struct net_device *dev;
-	struct ib_ah      *ah;
+	struct ib_ah	  *ah;
 	struct list_head   list;
-	struct kref        ref;
-	unsigned           last_send;
+	struct kref	   ref;
+	unsigned	   last_send;
 };
 
 struct ipoib_path {
@@ -331,11 +337,11 @@ struct ipoib_path {
 
 	struct list_head      neigh_list;
 
-	int                   query_id;
+	int		      query_id;
 	struct ib_sa_query   *query;
 	struct completion     done;
 
-	struct rb_node        rb_node;
+	struct rb_node	      rb_node;
 	struct list_head      list;
 };
 
@@ -344,7 +350,7 @@ struct ipoib_neigh {
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 	struct ipoib_cm_tx *cm;
 #endif
-	union ib_gid        dgid;
+	union ib_gid	    dgid;
 	struct sk_buff_head queue;
 
 	struct neighbour   *neighbour;
@@ -455,12 +461,14 @@ void ipoib_drain_cq(struct net_device *dev);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
 
-#define IPOIB_FLAGS_RC          0x80
-#define IPOIB_FLAGS_UC          0x40
+#define IPOIB_FLAGS_RC		0x80
+#define IPOIB_FLAGS_UC		0x40
 
 /* We don't support UC connections at the moment */
 #define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))
 
+extern int ipoib_max_conn_qp;
+
 static inline int ipoib_cm_admin_enabled(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -491,6 +499,18 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 	neigh->cm = tx;
 }
 
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return !!priv->cm.srq;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	return priv->cm.max_cm_mtu;
+}
+
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
 int ipoib_cm_dev_open(struct net_device *dev);
 void ipoib_cm_dev_stop(struct net_device *dev);
@@ -500,7 +520,7 @@ void ipoib_cm_dev_cleanup(struct net_device *dev);
 struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
 				    struct ipoib_neigh *neigh);
 void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
-void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 			   unsigned int mtu);
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
 void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
@@ -508,6 +528,8 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 
 struct ipoib_cm_tx;
 
+#define ipoib_max_conn_qp 0
+
 static inline int ipoib_cm_admin_enabled(struct net_device *dev)
 {
 	return 0;
@@ -533,6 +555,16 @@ static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *t
 {
 }
 
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+	return 0;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+	return 0;
+}
+
 static inline
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
@@ -582,7 +614,7 @@ int ipoib_cm_add_mode_attr(struct net_device *dev)
 	return 0;
 }
 
-static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 					 unsigned int mtu)
 {
 	dev_kfree_skb_any(skb);
@@ -624,12 +656,12 @@ extern struct ib_sa_client ipoib_sa_client;
 extern int ipoib_debug_level;
 
 #define ipoib_dbg(priv, format, arg...)			\
-	do {					        \
+	do {						\
 		if (ipoib_debug_level > 0)			\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
 #define ipoib_dbg_mcast(priv, format, arg...)		\
-	do {					        \
+	do {						\
 		if (mcast_debug_level > 0)		\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
@@ -642,7 +674,7 @@ extern int ipoib_debug_level;
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 #define ipoib_dbg_data(priv, format, arg...)		\
-	do {					        \
+	do {						\
 		if (data_debug_level > 0)		\
 			ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
 	} while (0)
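
The new ipoib_cm_has_srq() and ipoib_cm_max_mtu() helpers come in two flavours, a real inline under CONFIG_INFINIBAND_IPOIB_CM and a stub returning 0 when connected mode is compiled out, so callers never need their own #ifdefs. A minimal sketch of that pattern with a stand-in feature switch (FEATURE_CM is invented for the example):

#include <stdio.h>

#define FEATURE_CM 1

struct dev { int max_cm_mtu; };

#if FEATURE_CM
static inline int cm_max_mtu(const struct dev *d) { return d->max_cm_mtu; }
#else
static inline int cm_max_mtu(const struct dev *d) { (void)d; return 0; }
#endif

int main(void)
{
	struct dev d = { .max_cm_mtu = 65520 };

	/* the caller looks the same whichever branch was compiled */
	printf("connected-mode MTU cap: %d\n", cm_max_mtu(&d));
	return 0;
}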

+ 274 - 102
drivers/infiniband/ulp/ipoib/ipoib_cm.c

@@ -39,6 +39,15 @@
 #include <linux/icmpv6.h>
 #include <linux/delay.h>
 
+#include "ipoib.h"
+
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+		 "Max number of connected-mode QPs per interface "
+		 "(applied only if shared receive queue is not available)");
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 static int data_debug_level;
 
@@ -47,8 +56,6 @@ MODULE_PARM_DESC(cm_data_debug_level,
 		 "Enable data path debug tracing for connected mode if > 0");
 #endif
 
-#include "ipoib.h"
-
 #define IPOIB_CM_IETF_ID 0x1000000000000000ULL
 
 #define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
@@ -81,7 +88,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
 		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
-static int ipoib_cm_post_receive(struct net_device *dev, int id)
+static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_recv_wr *bad_wr;
@@ -89,13 +96,13 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
 
 	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
-	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+	for (i = 0; i < priv->cm.num_frags; ++i)
 		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
 
 	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
 				      priv->cm.srq_ring[id].mapping);
 		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
 		priv->cm.srq_ring[id].skb = NULL;
@@ -104,7 +111,33 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
 	return ret;
 }
 
-static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
+static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
+					struct ipoib_cm_rx *rx, int id)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_recv_wr *bad_wr;
+	int i, ret;
+
+	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+		priv->cm.rx_sge[i].addr = rx->rx_ring[id].mapping[i];
+
+	ret = ib_post_recv(rx->qp, &priv->cm.rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
+		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+				      rx->rx_ring[id].mapping);
+		dev_kfree_skb_any(rx->rx_ring[id].skb);
+		rx->rx_ring[id].skb = NULL;
+	}
+
+	return ret;
+}
+
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
+					     struct ipoib_cm_rx_buf *rx_ring,
+					     int id, int frags,
 					     u64 mapping[IPOIB_CM_RX_SG])
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -141,7 +174,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int
 			goto partial_error;
 	}
 
-	priv->cm.srq_ring[id].skb = skb;
+	rx_ring[id].skb = skb;
 	return skb;
 
 partial_error:
@@ -155,7 +188,23 @@ partial_error:
 	return NULL;
 }
 
-static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+static void ipoib_cm_free_rx_ring(struct net_device *dev,
+				  struct ipoib_cm_rx_buf *rx_ring)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < ipoib_recvq_size; ++i)
+		if (rx_ring[i].skb) {
+			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+					      rx_ring[i].mapping);
+			dev_kfree_skb_any(rx_ring[i].skb);
+		}
+
+	kfree(rx_ring);
+}
+
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
 {
 	struct ib_send_wr *bad_wr;
 	struct ipoib_cm_rx *p;
@@ -208,12 +257,18 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 		.qp_type = IB_QPT_RC,
 		.qp_context = p,
 	};
+
+	if (!ipoib_cm_has_srq(dev)) {
+		attr.cap.max_recv_wr  = ipoib_recvq_size;
+		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
+	}
+
 	return ib_create_qp(priv->pd, &attr);
 }
 
 static int ipoib_cm_modify_rx_qp(struct net_device *dev,
-				  struct ib_cm_id *cm_id, struct ib_qp *qp,
-				  unsigned psn)
+				 struct ib_cm_id *cm_id, struct ib_qp *qp,
+				 unsigned psn)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
@@ -266,6 +321,60 @@ static int ipoib_cm_modify_rx_qp(struct net_device *dev,
 	return 0;
 }
 
+static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
+				   struct ipoib_cm_rx *rx)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int ret;
+	int i;
+
+	rx->rx_ring = kcalloc(ipoib_recvq_size, sizeof *rx->rx_ring, GFP_KERNEL);
+	if (!rx->rx_ring)
+		return -ENOMEM;
+
+	spin_lock_irq(&priv->lock);
+
+	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+		spin_unlock_irq(&priv->lock);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+		ret = -EINVAL;
+		goto err_free;
+	} else
+		++priv->cm.nonsrq_conn_qp;
+
+	spin_unlock_irq(&priv->lock);
+
+	for (i = 0; i < ipoib_recvq_size; ++i) {
+		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
+					   rx->rx_ring[i].mapping)) {
+			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+			ret = -ENOMEM;
+			goto err_count;
+		}
+		ret = ipoib_cm_post_receive_nonsrq(dev, rx, i);
+		if (ret) {
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+				   "failed for buf %d\n", i);
+			ret = -EIO;
+			goto err_count;
+		}
+	}
+
+	rx->recv_count = ipoib_recvq_size;
+
+	return 0;
+
+err_count:
+	spin_lock_irq(&priv->lock);
+	--priv->cm.nonsrq_conn_qp;
+	spin_unlock_irq(&priv->lock);
+
+err_free:
+	ipoib_cm_free_rx_ring(dev, rx->rx_ring);
+
+	return ret;
+}
+
 static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
 			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
 			     unsigned psn)
@@ -281,7 +390,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
 	rep.private_data_len = sizeof data;
 	rep.flow_control = 0;
 	rep.rnr_retry_count = req->rnr_retry_count;
-	rep.srq = 1;
+	rep.srq = ipoib_cm_has_srq(dev);
 	rep.qp_num = qp->qp_num;
 	rep.starting_psn = psn;
 	return ib_send_cm_rep(cm_id, &rep);
@@ -317,6 +426,12 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	if (ret)
 		goto err_modify;
 
+	if (!ipoib_cm_has_srq(dev)) {
+		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
+		if (ret)
+			goto err_modify;
+	}
+
 	spin_lock_irq(&priv->lock);
 	queue_delayed_work(ipoib_workqueue,
 			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
@@ -401,12 +516,14 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx_buf *rx_ring;
 	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
 	struct sk_buff *skb, *newskb;
 	struct ipoib_cm_rx *p;
 	unsigned long flags;
 	u64 mapping[IPOIB_CM_RX_SG];
 	int frags;
+	int has_srq;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -424,18 +541,32 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		return;
 	}
 
-	skb  = priv->cm.srq_ring[wr_id].skb;
+	p = wc->qp->qp_context;
+
+	has_srq = ipoib_cm_has_srq(dev);
+	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+	skb = rx_ring[wr_id].skb;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		ipoib_dbg(priv, "cm recv error "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 		++dev->stats.rx_dropped;
-		goto repost;
+		if (has_srq)
+			goto repost;
+		else {
+			if (!--p->recv_count) {
+				spin_lock_irqsave(&priv->lock, flags);
+				list_move(&p->list, &priv->cm.rx_reap_list);
+				spin_unlock_irqrestore(&priv->lock, flags);
+				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			}
+			return;
+		}
 	}
 
 	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
-		p = wc->qp->qp_context;
 		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
 			spin_lock_irqsave(&priv->lock, flags);
 			p->jiffies = jiffies;
@@ -450,7 +581,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
 					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
 
-	newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping);
 	if (unlikely(!newskb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
@@ -461,8 +592,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		goto repost;
 	}
 
-	ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
-	memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
+	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
@@ -483,9 +614,17 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	netif_receive_skb(skb);
 
 repost:
-	if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
-		ipoib_warn(priv, "ipoib_cm_post_receive failed "
-			   "for buf %d\n", wr_id);
+	if (has_srq) {
+		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
+			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+				   "for buf %d\n", wr_id);
+	} else {
+		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p, wr_id))) {
+			--p->recv_count;
+			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+				   "for buf %d\n", wr_id);
+		}
+	}
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
@@ -495,10 +634,10 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 {
 	struct ib_send_wr *bad_wr;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
+	priv->tx_sge.addr	= addr;
+	priv->tx_sge.length	= len;
 
-	priv->tx_wr.wr_id 	      = wr_id | IPOIB_OP_CM;
+	priv->tx_wr.wr_id	= wr_id | IPOIB_OP_CM;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -540,7 +679,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-			        addr, skb->len))) {
+			       addr, skb->len))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++dev->stats.tx_errors;
 		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
@@ -657,10 +796,33 @@ err_cm:
 	return ret;
 }
 
+static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ipoib_cm_rx *rx, *n;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(rx, n, &list, list) {
+		ib_destroy_cm_id(rx->id);
+		ib_destroy_qp(rx->qp);
+		if (!ipoib_cm_has_srq(dev)) {
+			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
+			spin_lock_irq(&priv->lock);
+			--priv->cm.nonsrq_conn_qp;
+			spin_unlock_irq(&priv->lock);
+		}
+		kfree(rx);
+	}
+}
+
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_rx *p, *n;
+	struct ipoib_cm_rx *p;
 	unsigned long begin;
 	LIST_HEAD(list);
 	int ret;
@@ -706,15 +868,9 @@ void ipoib_cm_dev_stop(struct net_device *dev)
 		spin_lock_irq(&priv->lock);
 	}
 
-	list_splice_init(&priv->cm.rx_reap_list, &list);
-
 	spin_unlock_irq(&priv->lock);
 
-	list_for_each_entry_safe(p, n, &list, list) {
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
-	}
+	ipoib_cm_free_rx_reap_list(dev);
 
 	cancel_delayed_work(&priv->cm.stale_task);
 }
@@ -799,7 +955,7 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 		.sq_sig_type		= IB_SIGNAL_ALL_WR,
 		.qp_type		= IB_QPT_RC,
 		.qp_context		= tx
-        };
+	};
 
 	return ib_create_qp(priv->pd, &attr);
 }
@@ -816,28 +972,28 @@ static int ipoib_cm_send_req(struct net_device *dev,
 	data.qpn = cpu_to_be32(priv->qp->qp_num);
 	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
 
-	req.primary_path 	      = pathrec;
-	req.alternate_path 	      = NULL;
-	req.service_id                = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
-	req.qp_num 		      = qp->qp_num;
-	req.qp_type 		      = qp->qp_type;
-	req.private_data 	      = &data;
-	req.private_data_len 	      = sizeof data;
-	req.flow_control 	      = 0;
+	req.primary_path		= pathrec;
+	req.alternate_path		= NULL;
+	req.service_id			= cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+	req.qp_num			= qp->qp_num;
+	req.qp_type			= qp->qp_type;
+	req.private_data		= &data;
+	req.private_data_len		= sizeof data;
+	req.flow_control		= 0;
 
-	req.starting_psn              = 0; /* FIXME */
+	req.starting_psn		= 0; /* FIXME */
 
 	/*
 	 * Pick some arbitrary defaults here; we could make these
 	 * module parameters if anyone cared about setting them.
 	 */
-	req.responder_resources	      = 4;
-	req.remote_cm_response_timeout = 20;
-	req.local_cm_response_timeout  = 20;
-	req.retry_count 	      = 0; /* RFC draft warns against retries */
-	req.rnr_retry_count 	      = 0; /* RFC draft warns against retries */
-	req.max_cm_retries 	      = 15;
-	req.srq 	              = 1;
+	req.responder_resources		= 4;
+	req.remote_cm_response_timeout	= 20;
+	req.local_cm_response_timeout	= 20;
+	req.retry_count			= 0; /* RFC draft warns against retries */
+	req.rnr_retry_count		= 0; /* RFC draft warns against retries */
+	req.max_cm_retries		= 15;
+	req.srq				= ipoib_cm_has_srq(dev);
 	return ib_send_cm_req(id, &req);
 }
 
@@ -1150,7 +1306,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work)
 	spin_unlock_irq(&priv->tx_lock);
 }
 
-void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
 			   unsigned int mtu)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1166,20 +1322,8 @@ void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
 
 static void ipoib_cm_rx_reap(struct work_struct *work)
 {
-	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
-						   cm.rx_reap_task);
-	struct ipoib_cm_rx *p, *n;
-	LIST_HEAD(list);
-
-	spin_lock_irq(&priv->lock);
-	list_splice_init(&priv->cm.rx_reap_list, &list);
-	spin_unlock_irq(&priv->lock);
-
-	list_for_each_entry_safe(p, n, &list, list) {
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
-	}
+	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
+						cm.rx_reap_task)->dev);
 }
 
 static void ipoib_cm_stale_task(struct work_struct *work)
@@ -1212,7 +1356,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
 }
 
 
-static ssize_t show_mode(struct device *d, struct device_attribute *attr, 
+static ssize_t show_mode(struct device *d, struct device_attribute *attr,
 			 char *buf)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
@@ -1255,16 +1399,40 @@ int ipoib_cm_add_mode_attr(struct net_device *dev)
 	return device_create_file(&dev->dev, &dev_attr_mode);
 }
 
-int ipoib_cm_dev_init(struct net_device *dev)
+static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_srq_init_attr srq_init_attr = {
 		.attr = {
 			.max_wr  = ipoib_recvq_size,
-			.max_sge = IPOIB_CM_RX_SG
+			.max_sge = max_sge
 		}
 	};
-	int ret, i;
+
+	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+	if (IS_ERR(priv->cm.srq)) {
+		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+			       priv->ca->name, PTR_ERR(priv->cm.srq));
+		priv->cm.srq = NULL;
+		return;
+	}
+
+	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
+				    GFP_KERNEL);
+	if (!priv->cm.srq_ring) {
+		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+		       priv->ca->name, ipoib_recvq_size);
+		ib_destroy_srq(priv->cm.srq);
+		priv->cm.srq = NULL;
+	}
+}
+
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i, ret;
+	struct ib_device_attr attr;
 
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
 	INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1281,43 +1449,53 @@ int ipoib_cm_dev_init(struct net_device *dev)
 
 	skb_queue_head_init(&priv->cm.skb_queue);
 
-	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
-	if (IS_ERR(priv->cm.srq)) {
-		ret = PTR_ERR(priv->cm.srq);
-		priv->cm.srq = NULL;
+	ret = ib_query_device(priv->ca, &attr);
+	if (ret) {
+		printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
 		return ret;
 	}
 
-	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
-				    GFP_KERNEL);
-	if (!priv->cm.srq_ring) {
-		printk(KERN_WARNING "%s: failed to allocate CM ring (%d entries)\n",
-		       priv->ca->name, ipoib_recvq_size);
-		ipoib_cm_dev_cleanup(dev);
-		return -ENOMEM;
+	ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+	attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
+	ipoib_cm_create_srq(dev, attr.max_srq_sge);
+	if (ipoib_cm_has_srq(dev)) {
+		priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
+		priv->cm.num_frags  = attr.max_srq_sge;
+		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+			  priv->cm.max_cm_mtu, priv->cm.num_frags);
+	} else {
+		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
+		priv->cm.num_frags  = IPOIB_CM_RX_SG;
 	}
 
-	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+	for (i = 0; i < priv->cm.num_frags; ++i)
 		priv->cm.rx_sge[i].lkey	= priv->mr->lkey;
 
 	priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
-	for (i = 1; i < IPOIB_CM_RX_SG; ++i)
+	for (i = 1; i < priv->cm.num_frags; ++i)
 		priv->cm.rx_sge[i].length = PAGE_SIZE;
 	priv->cm.rx_wr.next = NULL;
 	priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
-	priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
+	priv->cm.rx_wr.num_sge = priv->cm.num_frags;
+
+	if (ipoib_cm_has_srq(dev)) {
+		for (i = 0; i < ipoib_recvq_size; ++i) {
+			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
+						   priv->cm.num_frags - 1,
+						   priv->cm.srq_ring[i].mapping)) {
+				ipoib_warn(priv, "failed to allocate "
+					   "receive buffer %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -ENOMEM;
+			}
 
-	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
-					   priv->cm.srq_ring[i].mapping)) {
-			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
-			ipoib_cm_dev_cleanup(dev);
-			return -ENOMEM;
-		}
-		if (ipoib_cm_post_receive(dev, i)) {
-			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
-			ipoib_cm_dev_cleanup(dev);
-			return -EIO;
+			if (ipoib_cm_post_receive_srq(dev, i)) {
+				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+					   "failed for buf %d\n", i);
+				ipoib_cm_dev_cleanup(dev);
+				return -EIO;
+			}
 		}
 	}
 
@@ -1328,7 +1506,7 @@ int ipoib_cm_dev_init(struct net_device *dev)
 void ipoib_cm_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	int i, ret;
+	int ret;
 
 	if (!priv->cm.srq)
 		return;
@@ -1342,13 +1520,7 @@ void ipoib_cm_dev_cleanup(struct net_device *dev)
 	priv->cm.srq = NULL;
 	if (!priv->cm.srq_ring)
 		return;
-	for (i = 0; i < ipoib_recvq_size; ++i)
-		if (priv->cm.srq_ring[i].skb) {
-			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-					      priv->cm.srq_ring[i].mapping);
-			dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
-			priv->cm.srq_ring[i].skb = NULL;
-		}
-	kfree(priv->cm.srq_ring);
+
+	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
 	priv->cm.srq_ring = NULL;
 }
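A quick note on the sizing arithmetic introduced above: when an SRQ is available, the connected-mode MTU is now derived from the number of scatter/gather entries the SRQ supports (max_srq_sge * PAGE_SIZE minus 16 bytes of headroom), and each receive completion computes how many page-sized fragments follow the head buffer. The userspace sketch below reproduces only that arithmetic; PAGE_SIZE, the head-buffer size and the max_srq_sge value are illustrative assumptions, not values queried from a device.

#include <stdio.h>

#define PAGE_SIZE		4096		/* assumed page size */
#define IPOIB_CM_HEAD_SIZE	2048		/* assumed head-buffer size */

static unsigned int page_align(unsigned int len)
{
	return (len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	int max_srq_sge = 16;		/* example value reported by the HCA */
	unsigned int max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
	unsigned int byte_len = 9000;	/* example received frame length */
	unsigned int head = byte_len < IPOIB_CM_HEAD_SIZE ? byte_len : IPOIB_CM_HEAD_SIZE;
	unsigned int frags = page_align(byte_len - head) / PAGE_SIZE;

	printf("max_cm_mtu = %u, trailing frags for %u bytes = %u\n",
	       max_cm_mtu, byte_len, frags);
	return 0;
}

Under these assumed sizes this prints a 65520-byte connected-mode MTU and two trailing page fragments for a 9000-byte frame.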

+ 2 - 2
drivers/infiniband/ulp/ipoib/ipoib_fs.c

@@ -124,7 +124,7 @@ static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
 	return 0;
 }
 
-static struct seq_operations ipoib_mcg_seq_ops = {
+static const struct seq_operations ipoib_mcg_seq_ops = {
 	.start = ipoib_mcg_seq_start,
 	.next  = ipoib_mcg_seq_next,
 	.stop  = ipoib_mcg_seq_stop,
@@ -230,7 +230,7 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
 	return 0;
 }
 
-static struct seq_operations ipoib_path_seq_ops = {
+static const struct seq_operations ipoib_path_seq_ops = {
 	.start = ipoib_path_seq_start,
 	.next  = ipoib_path_seq_next,
 	.stop  = ipoib_path_seq_stop,

+ 4 - 4
drivers/infiniband/ulp/ipoib/ipoib_ib.c

@@ -345,12 +345,12 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 {
 	struct ib_send_wr *bad_wr;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
+	priv->tx_sge.addr	      = addr;
+	priv->tx_sge.length	      = len;
 
-	priv->tx_wr.wr_id 	      = wr_id;
+	priv->tx_wr.wr_id	      = wr_id;
 	priv->tx_wr.wr.ud.remote_qpn  = qpn;
-	priv->tx_wr.wr.ud.ah 	      = address;
+	priv->tx_wr.wr.ud.ah	      = address;
 
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }

+ 28 - 32
drivers/infiniband/ulp/ipoib/ipoib_main.c

@@ -182,17 +182,20 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	/* dev->mtu > 2K ==> connected mode */
-	if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
+	if (ipoib_cm_admin_enabled(dev)) {
+		if (new_mtu > ipoib_cm_max_mtu(dev))
+			return -EINVAL;
+
 		if (new_mtu > priv->mcast_mtu)
 			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
 				   priv->mcast_mtu);
+
 		dev->mtu = new_mtu;
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
+	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
 		return -EINVAL;
-	}
 
 	priv->admin_mtu = new_mtu;
 
@@ -474,8 +477,8 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
 	INIT_LIST_HEAD(&path->neigh_list);
 
 	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
-	path->pathrec.sgid          = priv->local_gid;
-	path->pathrec.pkey          = cpu_to_be16(priv->pkey);
+	path->pathrec.sgid	    = priv->local_gid;
+	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
 	path->pathrec.numb_path     = 1;
 	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
 
@@ -669,16 +672,6 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
 		return NETDEV_TX_LOCKED;
 
-	/*
-	 * Check if our queue is stopped.  Since we have the LLTX bit
-	 * set, we can't rely on netif_stop_queue() preventing our
-	 * xmit function from being called with a full queue.
-	 */
-	if (unlikely(netif_queue_stopped(dev))) {
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
-		return NETDEV_TX_BUSY;
-	}
-
 	if (likely(skb->dst && skb->dst->neighbour)) {
 		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
 			ipoib_path_lookup(skb, dev);
@@ -950,34 +943,34 @@ static void ipoib_setup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	dev->open 		 = ipoib_open;
-	dev->stop 		 = ipoib_stop;
-	dev->change_mtu 	 = ipoib_change_mtu;
-	dev->hard_start_xmit 	 = ipoib_start_xmit;
-	dev->tx_timeout 	 = ipoib_timeout;
-	dev->header_ops 	 = &ipoib_header_ops;
-	dev->set_multicast_list  = ipoib_set_mcast_list;
-	dev->neigh_setup         = ipoib_neigh_setup_dev;
+	dev->open		 = ipoib_open;
+	dev->stop		 = ipoib_stop;
+	dev->change_mtu		 = ipoib_change_mtu;
+	dev->hard_start_xmit	 = ipoib_start_xmit;
+	dev->tx_timeout		 = ipoib_timeout;
+	dev->header_ops		 = &ipoib_header_ops;
+	dev->set_multicast_list	 = ipoib_set_mcast_list;
+	dev->neigh_setup	 = ipoib_neigh_setup_dev;
 
 	netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
 
-	dev->watchdog_timeo 	 = HZ;
+	dev->watchdog_timeo	 = HZ;
 
-	dev->flags              |= IFF_BROADCAST | IFF_MULTICAST;
+	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;
 
 	/*
 	 * We add in INFINIBAND_ALEN to allow for the destination
 	 * address "pseudoheader" for skbs without neighbour struct.
 	 */
-	dev->hard_header_len 	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
-	dev->addr_len 		 = INFINIBAND_ALEN;
-	dev->type 		 = ARPHRD_INFINIBAND;
-	dev->tx_queue_len 	 = ipoib_sendq_size * 2;
-	dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+	dev->hard_header_len	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
+	dev->addr_len		 = INFINIBAND_ALEN;
+	dev->type		 = ARPHRD_INFINIBAND;
+	dev->tx_queue_len	 = ipoib_sendq_size * 2;
+	dev->features		 = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
 	/* MTU will be reset when mcast join happens */
-	dev->mtu 		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu 	 = priv->admin_mtu = dev->mtu;
+	dev->mtu		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
+	priv->mcast_mtu		 = priv->admin_mtu = dev->mtu;
 
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
@@ -1268,6 +1261,9 @@ static int __init ipoib_init_module(void)
 	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
 	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
 	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
 
 	ret = ipoib_register_debugfs();
 	if (ret)

+ 2 - 6
drivers/infiniband/ulp/ipoib/ipoib_multicast.c

@@ -702,7 +702,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
 
 out:
 	if (mcast && mcast->ah) {
-		if (skb->dst            &&
+		if (skb->dst		&&
 		    skb->dst->neighbour &&
 		    !*to_ipoib_neigh(skb->dst->neighbour)) {
 			struct ipoib_neigh *neigh = ipoib_neigh_alloc(skb->dst->neighbour,
@@ -710,7 +710,7 @@ out:
 
 			if (neigh) {
 				kref_get(&mcast->ah->ref);
-				neigh->ah  	= mcast->ah;
+				neigh->ah	= mcast->ah;
 				list_add_tail(&neigh->list, &mcast->neigh_list);
 			}
 		}
@@ -788,10 +788,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 
 		memcpy(mgid.raw, mclist->dmi_addr + 4, sizeof mgid);
 
-		/* Add in the P_Key */
-		mgid.raw[4] = (priv->pkey >> 8) & 0xff;
-		mgid.raw[5] = priv->pkey & 0xff;
-
 		mcast = __ipoib_mcast_find(dev, &mgid);
 		if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
 			struct ipoib_mcast *nmcast;

+ 11 - 7
drivers/infiniband/ulp/ipoib/ipoib_verbs.c

@@ -172,8 +172,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 
 	size = ipoib_sendq_size + ipoib_recvq_size + 1;
 	ret = ipoib_cm_dev_init(dev);
-	if (!ret)
-		size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
+	if (!ret) {
+		if (ipoib_cm_has_srq(dev))
+			size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
+		else
+			size += ipoib_recvq_size * ipoib_max_conn_qp;
+	}
 
 	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
 	if (IS_ERR(priv->cq)) {
@@ -197,12 +201,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
 	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
-	priv->tx_sge.lkey 	= priv->mr->lkey;
+	priv->tx_sge.lkey	= priv->mr->lkey;
 
-	priv->tx_wr.opcode 	= IB_WR_SEND;
-	priv->tx_wr.sg_list 	= &priv->tx_sge;
-	priv->tx_wr.num_sge 	= 1;
-	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
+	priv->tx_wr.opcode	= IB_WR_SEND;
+	priv->tx_wr.sg_list	= &priv->tx_sge;
+	priv->tx_wr.num_sge	= 1;
+	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
 	return 0;
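For reference, the completion-queue sizing rule above is easy to sanity-check in isolation: with an SRQ one shared receive ring (plus one entry for the rx_drain_qp) suffices, while without an SRQ every connected-mode QP brings its own receive ring. The queue lengths in this userspace sketch are illustrative, not values read from a device.

#include <stdio.h>

int main(void)
{
	int sendq = 128, recvq = 256, max_conn_qp = 128;	/* illustrative sizes */
	int size = sendq + recvq + 1;				/* UD send + receive rings */
	int with_srq    = size + recvq + 1;			/* 1 extra for rx_drain_qp */
	int without_srq = size + recvq * max_conn_qp;		/* one ring per connected QP */

	printf("CQ entries needed: with SRQ %d, without SRQ %d\n",
	       with_srq, without_srq);
	return 0;
}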
 

+ 2 - 2
drivers/infiniband/ulp/iser/Kconfig

@@ -8,5 +8,5 @@ config INFINIBAND_ISER
           that speak iSCSI over iSER over InfiniBand.
 
 	  The iSER protocol is defined by IETF.
-	  See <http://www.ietf.org/internet-drafts/draft-ietf-ips-iser-05.txt>
-	  and <http://www.infinibandta.org/members/spec/iser_annex_060418.pdf>
+	  See <http://www.ietf.org/rfc/rfc5046.txt>
+	  and <http://www.infinibandta.org/members/spec/Annex_iSER.PDF>

+ 1 - 0
drivers/infiniband/ulp/iser/iscsi_iser.c

@@ -551,6 +551,7 @@ static struct scsi_host_template iscsi_iser_sht = {
 	.module                 = THIS_MODULE,
 	.name                   = "iSCSI Initiator over iSER, v." DRV_VER,
 	.queuecommand           = iscsi_queuecommand,
+	.change_queue_depth	= iscsi_change_queue_depth,
 	.can_queue		= ISCSI_DEF_XMIT_CMDS_MAX - 1,
 	.sg_tablesize           = ISCSI_ISER_SG_TABLESIZE,
 	.max_sectors		= 1024,

+ 1 - 1
drivers/infiniband/ulp/iser/iser_initiator.c

@@ -561,7 +561,7 @@ void iser_rcv_completion(struct iser_desc *rx_desc,
 	if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
 	        itt = get_itt(hdr->itt); /* mask out cid and age bits */
 		if (!(itt < session->cmds_max))
-			iser_err("itt can't be matched to task!!!"
+			iser_err("itt can't be matched to task!!! "
 				 "conn %p opcode %d cmds_max %d itt %d\n",
 				 conn->iscsi_conn,opcode,session->cmds_max,itt);
 		/* use the mapping given with the cmds array indexed by itt */

+ 3 - 5
drivers/infiniband/ulp/iser/iser_verbs.c

@@ -105,7 +105,7 @@ pd_err:
 }
 
 /**
- * iser_free_device_ib_res - destory/dealloc/dereg the DMA MR,
+ * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
  * CQ and PD created with the device associated with the adapter.
  */
 static void iser_free_device_ib_res(struct iser_device *device)
@@ -475,13 +475,11 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
 		iser_disconnected_handler(cma_id);
 		break;
 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
+		iser_err("Device removal is currently unsupported\n");
 		BUG();
 		break;
-	case RDMA_CM_EVENT_CONNECT_RESPONSE:
-		BUG();
-		break;
-	case RDMA_CM_EVENT_CONNECT_REQUEST:
 	default:
+		iser_err("Unexpected RDMA CM event (%d)\n", event->event);
 		break;
 	}
 	return ret;

+ 81 - 50
drivers/infiniband/ulp/srp/ib_srp.c

@@ -272,7 +272,8 @@ static void srp_path_rec_completion(int status,
 
 	target->status = status;
 	if (status)
-		printk(KERN_ERR PFX "Got failed path rec status %d\n", status);
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Got failed path rec status %d\n", status);
 	else
 		target->path = *pathrec;
 	complete(&target->done);
@@ -303,7 +304,8 @@ static int srp_lookup_path(struct srp_target_port *target)
 	wait_for_completion(&target->done);
 
 	if (target->status < 0)
-		printk(KERN_WARNING PFX "Path record query failed\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Path record query failed\n");
 
 	return target->status;
 }
@@ -379,9 +381,10 @@ static int srp_send_req(struct srp_target_port *target)
 	 * the second 8 bytes to the local node GUID.
 	 */
 	if (srp_target_is_topspin(target)) {
-		printk(KERN_DEBUG PFX "Topspin/Cisco initiator port ID workaround "
-		       "activated for target GUID %016llx\n",
-		       (unsigned long long) be64_to_cpu(target->ioc_guid));
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Topspin/Cisco initiator port ID workaround "
+			     "activated for target GUID %016llx\n",
+			     (unsigned long long) be64_to_cpu(target->ioc_guid));
 		memset(req->priv.initiator_port_id, 0, 8);
 		memcpy(req->priv.initiator_port_id + 8,
 		       &target->srp_host->dev->dev->node_guid, 8);
@@ -400,7 +403,8 @@ static void srp_disconnect_target(struct srp_target_port *target)
 
 	init_completion(&target->done);
 	if (ib_send_cm_dreq(target->cm_id, NULL, 0)) {
-		printk(KERN_DEBUG PFX "Sending CM DREQ failed\n");
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM DREQ failed\n");
 		return;
 	}
 	wait_for_completion(&target->done);
@@ -568,7 +572,8 @@ static int srp_reconnect_target(struct srp_target_port *target)
 	return ret;
 
 err:
-	printk(KERN_ERR PFX "reconnect failed (%d), removing target port.\n", ret);
+	shost_printk(KERN_ERR, target->scsi_host,
+		     PFX "reconnect failed (%d), removing target port.\n", ret);
 
 	/*
 	 * We couldn't reconnect, so kill our target port off.
@@ -683,8 +688,9 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target,
 
 	if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
 	    scmnd->sc_data_direction != DMA_TO_DEVICE) {
-		printk(KERN_WARNING PFX "Unhandled data direction %d\n",
-		       scmnd->sc_data_direction);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled data direction %d\n",
+			     scmnd->sc_data_direction);
 		return -EINVAL;
 	}
 
@@ -786,8 +792,9 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp)
 	} else {
 		scmnd = req->scmnd;
 		if (!scmnd)
-			printk(KERN_ERR "Null scmnd for RSP w/tag %016llx\n",
-			       (unsigned long long) rsp->tag);
+			shost_printk(KERN_ERR, target->scsi_host,
+				     "Null scmnd for RSP w/tag %016llx\n",
+				     (unsigned long long) rsp->tag);
 		scmnd->result = rsp->status;
 
 		if (rsp->flags & SRP_RSP_FLAG_SNSVALID) {
@@ -831,7 +838,8 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 	if (0) {
 		int i;
 
-		printk(KERN_ERR PFX "recv completion, opcode 0x%02x\n", opcode);
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "recv completion, opcode 0x%02x\n", opcode);
 
 		for (i = 0; i < wc->byte_len; ++i) {
 			if (i % 8 == 0)
@@ -852,11 +860,13 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc)
 
 	case SRP_T_LOGOUT:
 		/* XXX Handle target logout */
-		printk(KERN_WARNING PFX "Got target logout request\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Got target logout request\n");
 		break;
 
 	default:
-		printk(KERN_WARNING PFX "Unhandled SRP opcode 0x%02x\n", opcode);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled SRP opcode 0x%02x\n", opcode);
 		break;
 	}
 
@@ -872,9 +882,10 @@ static void srp_completion(struct ib_cq *cq, void *target_ptr)
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
 		if (wc.status) {
-			printk(KERN_ERR PFX "failed %s status %d\n",
-			       wc.wr_id & SRP_OP_RECV ? "receive" : "send",
-			       wc.status);
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "failed %s status %d\n",
+				     wc.wr_id & SRP_OP_RECV ? "receive" : "send",
+				     wc.status);
 			target->qp_in_error = 1;
 			break;
 		}
@@ -930,13 +941,18 @@ static int srp_post_recv(struct srp_target_port *target)
  * req_lim and tx_head.  Lock cannot be dropped between call here and
  * call to __srp_post_send().
  */
-static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target)
+static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target,
+					enum srp_request_type req_type)
 {
+	s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2;
+
 	if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE)
 		return NULL;
 
-	if (unlikely(target->req_lim < 1))
+	if (target->req_lim < min) {
 		++target->zero_req_lim;
+		return NULL;
+	}
 
 	return target->tx_ring[target->tx_head & SRP_SQ_SIZE];
 }
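The effect of the new req_type argument is simply a per-type credit floor: a normal SCSI command now needs at least two request credits, so the last credit stays reserved for a task-management request, which needs only one. A minimal userspace sketch of that check follows; the SRP_REQ_* names mirror the enum added to ib_srp.h, everything else is illustrative.

#include <stdio.h>

enum srp_request_type {
	SRP_REQ_NORMAL,
	SRP_REQ_TASK_MGMT,
};

/* Return nonzero if a request of the given type may consume a credit. */
static int srp_may_send(int req_lim, enum srp_request_type req_type)
{
	int min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2;

	return req_lim >= min;
}

int main(void)
{
	int req_lim;

	for (req_lim = 0; req_lim <= 2; req_lim++)
		printf("req_lim=%d: normal=%d task_mgmt=%d\n", req_lim,
		       srp_may_send(req_lim, SRP_REQ_NORMAL),
		       srp_may_send(req_lim, SRP_REQ_TASK_MGMT));
	return 0;
}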
@@ -993,7 +1009,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 		return 0;
 	}
 
-	iu = __srp_get_tx_iu(target);
+	iu = __srp_get_tx_iu(target, SRP_REQ_NORMAL);
 	if (!iu)
 		goto err;
 
@@ -1022,12 +1038,13 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 
 	len = srp_map_data(scmnd, target, req);
 	if (len < 0) {
-		printk(KERN_ERR PFX "Failed to map data\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Failed to map data\n");
 		goto err;
 	}
 
 	if (__srp_post_recv(target)) {
-		printk(KERN_ERR PFX "Recv failed\n");
+		shost_printk(KERN_ERR, target->scsi_host, PFX "Recv failed\n");
 		goto err_unmap;
 	}
 
@@ -1035,7 +1052,7 @@ static int srp_queuecommand(struct scsi_cmnd *scmnd,
 				      DMA_TO_DEVICE);
 
 	if (__srp_post_send(target, iu, len)) {
-		printk(KERN_ERR PFX "Send failed\n");
+		shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
 		goto err_unmap;
 	}
 
@@ -1090,6 +1107,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event,
 			       struct srp_target_port *target)
 {
+	struct Scsi_Host *shost = target->scsi_host;
 	struct ib_class_port_info *cpi;
 	int opcode;
 
@@ -1115,19 +1133,22 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			memcpy(target->path.dgid.raw,
 			       event->param.rej_rcvd.ari, 16);
 
-			printk(KERN_DEBUG PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
-			       (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix),
-			       (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id));
+			shost_printk(KERN_DEBUG, shost,
+				     PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.subnet_prefix),
+				     (unsigned long long) be64_to_cpu(target->path.dgid.global.interface_id));
 
 			target->status = SRP_PORT_REDIRECT;
 		} else {
-			printk(KERN_WARNING "  REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
 			target->status = -ECONNRESET;
 		}
 		break;
 
 	case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID:
-		printk(KERN_WARNING "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
+		shost_printk(KERN_WARNING, shost,
+			    "  REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
 		target->status = -ECONNRESET;
 		break;
 
@@ -1138,20 +1159,21 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
 			u32 reason = be32_to_cpu(rej->reason);
 
 			if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE)
-				printk(KERN_WARNING PFX
-				       "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
+				shost_printk(KERN_WARNING, shost,
+					     PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
 			else
-				printk(KERN_WARNING PFX
-				       "SRP LOGIN REJECTED, reason 0x%08x\n", reason);
+				shost_printk(KERN_WARNING, shost,
+					    PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason);
 		} else
-			printk(KERN_WARNING "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
-			       " opcode 0x%02x\n", opcode);
+			shost_printk(KERN_WARNING, shost,
+				     "  REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
+				     " opcode 0x%02x\n", opcode);
 		target->status = -ECONNRESET;
 		break;
 
 	default:
-		printk(KERN_WARNING "  REJ reason 0x%x\n",
-		       event->param.rej_rcvd.reason);
+		shost_printk(KERN_WARNING, shost, "  REJ reason 0x%x\n",
+			     event->param.rej_rcvd.reason);
 		target->status = -ECONNRESET;
 	}
 }
@@ -1166,7 +1188,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 
 	switch (event->event) {
 	case IB_CM_REQ_ERROR:
-		printk(KERN_DEBUG PFX "Sending CM REQ failed\n");
+		shost_printk(KERN_DEBUG, target->scsi_host,
+			     PFX "Sending CM REQ failed\n");
 		comp = 1;
 		target->status = -ECONNRESET;
 		break;
@@ -1184,7 +1207,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 			target->scsi_host->can_queue = min(target->req_lim,
 							   target->scsi_host->can_queue);
 		} else {
-			printk(KERN_WARNING PFX "Unhandled RSP opcode %#x\n", opcode);
+			shost_printk(KERN_WARNING, target->scsi_host,
+				    PFX "Unhandled RSP opcode %#x\n", opcode);
 			target->status = -ECONNRESET;
 			break;
 		}
@@ -1230,20 +1254,23 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		break;
 
 	case IB_CM_REJ_RECEIVED:
-		printk(KERN_DEBUG PFX "REJ received\n");
+		shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
 		comp = 1;
 
 		srp_cm_rej_handler(cm_id, event, target);
 		break;
 
 	case IB_CM_DREQ_RECEIVED:
-		printk(KERN_WARNING PFX "DREQ received - connection closed\n");
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "DREQ received - connection closed\n");
 		if (ib_send_cm_drep(cm_id, NULL, 0))
-			printk(KERN_ERR PFX "Sending CM DREP failed\n");
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "Sending CM DREP failed\n");
 		break;
 
 	case IB_CM_TIMEWAIT_EXIT:
-		printk(KERN_ERR PFX "connection closed\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "connection closed\n");
 
 		comp = 1;
 		target->status = 0;
@@ -1255,7 +1282,8 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 		break;
 
 	default:
-		printk(KERN_WARNING PFX "Unhandled CM event %d\n", event->event);
+		shost_printk(KERN_WARNING, target->scsi_host,
+			     PFX "Unhandled CM event %d\n", event->event);
 		break;
 	}
 
@@ -1283,7 +1311,7 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
 
 	init_completion(&req->done);
 
-	iu = __srp_get_tx_iu(target);
+	iu = __srp_get_tx_iu(target, SRP_REQ_TASK_MGMT);
 	if (!iu)
 		goto out;
 
@@ -1332,7 +1360,7 @@ static int srp_abort(struct scsi_cmnd *scmnd)
 	struct srp_request *req;
 	int ret = SUCCESS;
 
-	printk(KERN_ERR "SRP abort called\n");
+	shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
 
 	if (target->qp_in_error)
 		return FAILED;
@@ -1362,7 +1390,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd)
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	struct srp_request *req, *tmp;
 
-	printk(KERN_ERR "SRP reset_device called\n");
+	shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
 
 	if (target->qp_in_error)
 		return FAILED;
@@ -1389,7 +1417,7 @@ static int srp_reset_host(struct scsi_cmnd *scmnd)
 	struct srp_target_port *target = host_to_target(scmnd->device->host);
 	int ret = FAILED;
 
-	printk(KERN_ERR PFX "SRP reset_host called\n");
+	shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
 
 	if (!srp_reconnect_target(target))
 		ret = SUCCESS;
@@ -1543,6 +1571,7 @@ static struct scsi_host_template srp_template = {
 	.this_id			= -1,
 	.cmd_per_lun			= SRP_SQ_SIZE,
 	.use_clustering			= ENABLE_CLUSTERING,
+	.use_sg_chaining		= ENABLE_SG_CHAINING,
 	.shost_attrs			= srp_host_attrs
 };
 
@@ -1814,8 +1843,9 @@ static ssize_t srp_create_target(struct class_device *class_dev,
 
 	ib_get_cached_gid(host->dev->dev, host->port, 0, &target->path.sgid);
 
-	printk(KERN_DEBUG PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
-	       "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+	shost_printk(KERN_DEBUG, target->scsi_host, PFX
+		     "new target: id_ext %016llx ioc_guid %016llx pkey %04x "
+		     "service_id %016llx dgid %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
 	       (unsigned long long) be64_to_cpu(target->id_ext),
 	       (unsigned long long) be64_to_cpu(target->ioc_guid),
 	       be16_to_cpu(target->path.pkey),
@@ -1842,7 +1872,8 @@ static ssize_t srp_create_target(struct class_device *class_dev,
 	target->qp_in_error = 0;
 	ret = srp_connect_target(target);
 	if (ret) {
-		printk(KERN_ERR PFX "Connection failed\n");
+		shost_printk(KERN_ERR, target->scsi_host,
+			     PFX "Connection failed\n");
 		goto err_cm_id;
 	}
 

+ 5 - 0
drivers/infiniband/ulp/srp/ib_srp.h

@@ -79,6 +79,11 @@ enum srp_target_state {
 	SRP_TARGET_REMOVED
 };
 
+enum srp_request_type {
+	SRP_REQ_NORMAL,
+	SRP_REQ_TASK_MGMT,
+};
+
 struct srp_device {
 	struct list_head	dev_list;
 	struct ib_device       *dev;

+ 1 - 1
drivers/net/mlx4/fw.c

@@ -202,7 +202,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET);
 	dev_cap->reserved_eqs = 1 << (field & 0xf);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET);
-	dev_cap->max_eqs = 1 << (field & 0x7);
+	dev_cap->max_eqs = 1 << (field & 0xf);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET);
 	dev_cap->reserved_mtts = 1 << (field >> 4);
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET);

+ 7 - 4
include/net/if_inet6.h

@@ -269,18 +269,21 @@ static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf)
 	buf[0] = 0x00;
 }
 
-static inline void ipv6_ib_mc_map(struct in6_addr *addr, char *buf)
+static inline void ipv6_ib_mc_map(const struct in6_addr *addr,
+				  const unsigned char *broadcast, char *buf)
 {
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x60;		/* IPv6 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];	/* P_Key */
+	buf[9]  = broadcast[9];
 	memcpy(buf + 10, addr->s6_addr + 6, 10);
 }
 #endif

+ 6 - 4
include/net/ip.h

@@ -266,20 +266,22 @@ static inline void ip_eth_mc_map(__be32 naddr, char *buf)
  *	Leave P_Key as 0 to be filled in by driver.
  */
 
-static inline void ip_ib_mc_map(__be32 naddr, char *buf)
+static inline void ip_ib_mc_map(__be32 naddr, const unsigned char *broadcast, char *buf)
 {
 	__u32 addr;
+	unsigned char scope = broadcast[5] & 0xF;
+
 	buf[0]  = 0;		/* Reserved */
 	buf[1]  = 0xff;		/* Multicast QPN */
 	buf[2]  = 0xff;
 	buf[3]  = 0xff;
 	addr    = ntohl(naddr);
 	buf[4]  = 0xff;
-	buf[5]  = 0x12;		/* link local scope */
+	buf[5]  = 0x10 | scope;	/* scope from broadcast address */
 	buf[6]  = 0x40;		/* IPv4 signature */
 	buf[7]  = 0x1b;
-	buf[8]  = 0;		/* P_Key */
-	buf[9]  = 0;
+	buf[8]  = broadcast[8];		/* P_Key */
+	buf[9]  = broadcast[9];
 	buf[10] = 0;
 	buf[11] = 0;
 	buf[12] = 0;
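The interesting part of this hunk is that the scope nibble and the P_Key of the IPoIB multicast hardware address now come from the device's broadcast address instead of being hard-coded to link-local scope and a zero P_Key. The userspace sketch below exercises only the bytes touched here, using an assumed broadcast address with the default P_Key 0xffff and link-local scope.

#include <stdio.h>

int main(void)
{
	/* Assumed 20-byte IPoIB broadcast address: reserved byte, the
	 * broadcast QPN 0xffffff, then the broadcast MGID ff12:401b with
	 * the default P_Key 0xffff; the remaining bytes stay zero. */
	unsigned char broadcast[20] = {
		0x00, 0xff, 0xff, 0xff,
		0xff, 0x12, 0x40, 0x1b, 0xff, 0xff,
	};
	unsigned char buf[10];
	unsigned char scope = broadcast[5] & 0xf;
	int i;

	buf[0] = 0;			/* reserved */
	buf[1] = 0xff;			/* multicast QPN */
	buf[2] = 0xff;
	buf[3] = 0xff;
	buf[4] = 0xff;
	buf[5] = 0x10 | scope;		/* scope copied from the broadcast address */
	buf[6] = 0x40;			/* IPv4 signature */
	buf[7] = 0x1b;
	buf[8] = broadcast[8];		/* P_Key, no longer left as zero */
	buf[9] = broadcast[9];

	for (i = 0; i < 10; i++)
		printf("%02x%c", buf[i], i == 9 ? '\n' : ':');
	return 0;
}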

+ 3 - 1
include/rdma/ib_mad.h

@@ -230,7 +230,9 @@ struct ib_class_port_info
  * @seg_count: The number of RMPP segments allocated for this send.
  * @seg_size: Size of each RMPP segment.
  * @timeout_ms: Time to wait for a response.
- * @retries: Number of times to retry a request for a response.
+ * @retries: Number of times to retry a request for a response.  For MADs
+ *   using RMPP, this applies per window.  On completion, returns the number
+ *   of retries needed to complete the transfer.
  *
  * Users are responsible for initializing the MAD buffer itself, with the
  * exception of any RMPP header.  Additional segment buffer space allocated

+ 12 - 1
include/rdma/rdma_user_cm.h

@@ -60,7 +60,8 @@ enum {
 	RDMA_USER_CM_CMD_SET_OPTION,
 	RDMA_USER_CM_CMD_NOTIFY,
 	RDMA_USER_CM_CMD_JOIN_MCAST,
-	RDMA_USER_CM_CMD_LEAVE_MCAST
+	RDMA_USER_CM_CMD_LEAVE_MCAST,
+	RDMA_USER_CM_CMD_MIGRATE_ID
 };
 
 /*
@@ -230,4 +231,14 @@ struct rdma_ucm_set_option {
 	__u32 optlen;
 };
 
+struct rdma_ucm_migrate_id {
+	__u64 response;
+	__u32 id;
+	__u32 fd;
+};
+
+struct rdma_ucm_migrate_resp {
+	__u32 events_reported;
+};
+
 #endif /* RDMA_USER_CM_H */

+ 1 - 1
net/ipv4/arp.c

@@ -211,7 +211,7 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 		ip_tr_mc_map(addr, haddr);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ip_ib_mc_map(addr, haddr);
+		ip_ib_mc_map(addr, dev->broadcast, haddr);
 		return 0;
 	default:
 		if (dir) {

+ 1 - 1
net/ipv6/ndisc.c

@@ -337,7 +337,7 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
 		ipv6_arcnet_mc_map(addr, buf);
 		return 0;
 	case ARPHRD_INFINIBAND:
-		ipv6_ib_mc_map(addr, buf);
+		ipv6_ib_mc_map(addr, dev->broadcast, buf);
 		return 0;
 	default:
 		if (dir) {