Merge branches 'cxgb4', 'flowsteer', 'ipoib', 'iser', 'mlx4', 'ocrdma' and 'qib' into for-next

Roland Dreier 12 years ago
34 changed files with 2765 additions and 850 deletions
  1. + 4 - 0      drivers/infiniband/core/uverbs.h
  2. + 248 - 2    drivers/infiniband/core/uverbs_cmd.c
  3. + 36 - 6     drivers/infiniband/core/uverbs_main.c
  4. + 30 - 0     drivers/infiniband/core/verbs.c
  5. + 235 - 0    drivers/infiniband/hw/mlx4/main.c
  6. + 12 - 0     drivers/infiniband/hw/mlx4/mlx4_ib.h
  7. + 12 - 10    drivers/infiniband/hw/ocrdma/ocrdma.h
  8. + 18 - 14    drivers/infiniband/hw/ocrdma/ocrdma_abi.h
  9. + 8 - 6      drivers/infiniband/hw/ocrdma/ocrdma_ah.c
  10. + 207 - 251 drivers/infiniband/hw/ocrdma/ocrdma_hw.c
  11. + 9 - 4     drivers/infiniband/hw/ocrdma/ocrdma_hw.h
  12. + 7 - 0     drivers/infiniband/hw/ocrdma/ocrdma_main.c
  13. + 136 - 74  drivers/infiniband/hw/ocrdma/ocrdma_sli.h
  14. + 353 - 147 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
  15. + 6 - 0     drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
  16. + 4 - 1     drivers/infiniband/hw/qib/qib.h
  17. + 31 - 1    drivers/infiniband/hw/qib/qib_common.h
  18. + 1 - 1     drivers/infiniband/hw/qib/qib_file_ops.c
  19. + 1 - 1     drivers/infiniband/hw/qib/qib_init.c
  20. + 2 - 1     drivers/infiniband/hw/qib/qib_mad.h
  21. + 5 - 5     drivers/infiniband/hw/qib/qib_pcie.c
  22. + 7 - 1     drivers/infiniband/hw/qib/qib_sdma.c
  23. + 575 - 142 drivers/infiniband/hw/qib/qib_user_sdma.c
  24. + 0 - 3     drivers/infiniband/ulp/ipoib/ipoib_cm.c
  25. + 3 - 6     drivers/infiniband/ulp/ipoib/ipoib_main.c
  26. + 12 - 7    drivers/infiniband/ulp/iser/iscsi_iser.c
  27. + 59 - 14   drivers/infiniband/ulp/iser/iscsi_iser.h
  28. + 114 - 25  drivers/infiniband/ulp/iser/iser_initiator.c
  29. + 193 - 38  drivers/infiniband/ulp/iser/iser_memory.c
  30. + 211 - 81  drivers/infiniband/ulp/iser/iser_verbs.c
  31. + 2 - 1     drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
  32. + 0 - 5     include/linux/mlx4/device.h
  33. + 126 - 2   include/rdma/ib_verbs.h
  34. + 98 - 1    include/uapi/rdma/ib_user_verbs.h

+ 4 - 0
drivers/infiniband/core/uverbs.h

@@ -135,6 +135,7 @@ struct ib_usrq_object {
 struct ib_uqp_object {
 	struct ib_uevent_object	uevent;
 	struct list_head 	mcast_list;
+	struct ib_uxrcd_object *uxrcd;
 };
 
 struct ib_ucq_object {
@@ -155,6 +156,7 @@ extern struct idr ib_uverbs_cq_idr;
 extern struct idr ib_uverbs_qp_idr;
 extern struct idr ib_uverbs_srq_idr;
 extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
 
 void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
 
@@ -215,5 +217,7 @@ IB_UVERBS_DECLARE_CMD(destroy_srq);
 IB_UVERBS_DECLARE_CMD(create_xsrq);
 IB_UVERBS_DECLARE_CMD(open_xrcd);
 IB_UVERBS_DECLARE_CMD(close_xrcd);
+IB_UVERBS_DECLARE_CMD(create_flow);
+IB_UVERBS_DECLARE_CMD(destroy_flow);
 
 #endif /* UVERBS_H */

+ 248 - 2
drivers/infiniband/core/uverbs_cmd.c

@@ -54,6 +54,7 @@ static struct uverbs_lock_class qp_lock_class	= { .name = "QP-uobj" };
 static struct uverbs_lock_class ah_lock_class	= { .name = "AH-uobj" };
 static struct uverbs_lock_class srq_lock_class	= { .name = "SRQ-uobj" };
 static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
 
 #define INIT_UDATA(udata, ibuf, obuf, ilen, olen)			\
 	do {								\
@@ -330,6 +331,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
 	INIT_LIST_HEAD(&ucontext->srq_list);
 	INIT_LIST_HEAD(&ucontext->ah_list);
 	INIT_LIST_HEAD(&ucontext->xrcd_list);
+	INIT_LIST_HEAD(&ucontext->rule_list);
 	ucontext->closing = 0;
 
 	resp.num_comp_vectors = file->device->num_comp_vectors;
@@ -1526,7 +1528,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
 		   (unsigned long) cmd.response + sizeof resp,
 		   in_len - sizeof cmd, out_len - sizeof resp);
 
-	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	obj = kzalloc(sizeof *obj, GFP_KERNEL);
 	if (!obj)
 		return -ENOMEM;
 
@@ -1642,8 +1644,13 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
 		goto err_copy;
 	}
 
-	if (xrcd)
+	if (xrcd) {
+		obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+					  uobject);
+		atomic_inc(&obj->uxrcd->refcnt);
 		put_xrcd_read(xrcd_uobj);
+	}
+
 	if (pd)
 		put_pd_read(pd);
 	if (scq)
@@ -1753,6 +1760,8 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
 		goto err_remove;
 	}
 
+	obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+	atomic_inc(&obj->uxrcd->refcnt);
 	put_xrcd_read(xrcd_uobj);
 
 	mutex_lock(&file->mutex);
@@ -2019,6 +2028,9 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	if (obj->uxrcd)
+		atomic_dec(&obj->uxrcd->refcnt);
+
 	idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
 
 	mutex_lock(&file->mutex);
@@ -2587,6 +2599,232 @@ out_put:
 	return ret ? ret : in_len;
 }
 
+static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec,
+				union ib_flow_spec *ib_spec)
+{
+	ib_spec->type = kern_spec->type;
+
+	switch (ib_spec->type) {
+	case IB_FLOW_SPEC_ETH:
+		ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+		if (ib_spec->eth.size != kern_spec->eth.size)
+			return -EINVAL;
+		memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+		       sizeof(struct ib_flow_eth_filter));
+		memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+		       sizeof(struct ib_flow_eth_filter));
+		break;
+	case IB_FLOW_SPEC_IPV4:
+		ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+		if (ib_spec->ipv4.size != kern_spec->ipv4.size)
+			return -EINVAL;
+		memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+		       sizeof(struct ib_flow_ipv4_filter));
+		memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+		       sizeof(struct ib_flow_ipv4_filter));
+		break;
+	case IB_FLOW_SPEC_TCP:
+	case IB_FLOW_SPEC_UDP:
+		ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+		if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size)
+			return -EINVAL;
+		memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len,
+			      int out_len)
+{
+	struct ib_uverbs_create_flow	  cmd;
+	struct ib_uverbs_create_flow_resp resp;
+	struct ib_uobject		  *uobj;
+	struct ib_flow			  *flow_id;
+	struct ib_kern_flow_attr	  *kern_flow_attr;
+	struct ib_flow_attr		  *flow_attr;
+	struct ib_qp			  *qp;
+	int err = 0;
+	void *kern_spec;
+	void *ib_spec;
+	int i;
+	int kern_attr_size;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	if (cmd.comp_mask)
+		return -EINVAL;
+
+	if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
+	     !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+		return -EPERM;
+
+	if (cmd.flow_attr.num_of_specs < 0 ||
+	    cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+		return -EINVAL;
+
+	kern_attr_size = cmd.flow_attr.size - sizeof(cmd) -
+			 sizeof(struct ib_uverbs_cmd_hdr_ex);
+
+	if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len ||
+	    kern_attr_size < 0 || kern_attr_size >
+	    (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec)))
+		return -EINVAL;
+
+	if (cmd.flow_attr.num_of_specs) {
+		kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+		if (!kern_flow_attr)
+			return -ENOMEM;
+
+		memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+		if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd),
+				   kern_attr_size)) {
+			err = -EFAULT;
+			goto err_free_attr;
+		}
+	} else {
+		kern_flow_attr = &cmd.flow_attr;
+		kern_attr_size = sizeof(cmd.flow_attr);
+	}
+
+	uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+	if (!uobj) {
+		err = -ENOMEM;
+		goto err_free_attr;
+	}
+	init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+	down_write(&uobj->mutex);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		err = -EINVAL;
+		goto err_uobj;
+	}
+
+	flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL);
+	if (!flow_attr) {
+		err = -ENOMEM;
+		goto err_put;
+	}
+
+	flow_attr->type = kern_flow_attr->type;
+	flow_attr->priority = kern_flow_attr->priority;
+	flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+	flow_attr->port = kern_flow_attr->port;
+	flow_attr->flags = kern_flow_attr->flags;
+	flow_attr->size = sizeof(*flow_attr);
+
+	kern_spec = kern_flow_attr + 1;
+	ib_spec = flow_attr + 1;
+	for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) {
+		err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+		if (err)
+			goto err_free;
+		flow_attr->size +=
+			((union ib_flow_spec *) ib_spec)->size;
+		kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size;
+		kern_spec += ((struct ib_kern_spec *) kern_spec)->size;
+		ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+	}
+	if (kern_attr_size) {
+		pr_warn("create flow failed, %d bytes left from uverb cmd\n",
+			kern_attr_size);
+		goto err_free;
+	}
+	flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+	if (IS_ERR(flow_id)) {
+		err = PTR_ERR(flow_id);
+		goto err_free;
+	}
+	flow_id->qp = qp;
+	flow_id->uobject = uobj;
+	uobj->object = flow_id;
+
+	err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+	if (err)
+		goto destroy_flow;
+
+	memset(&resp, 0, sizeof(resp));
+	resp.flow_handle = uobj->id;
+
+	if (copy_to_user((void __user *)(unsigned long) cmd.response,
+			 &resp, sizeof(resp))) {
+		err = -EFAULT;
+		goto err_copy;
+	}
+
+	put_qp_read(qp);
+	mutex_lock(&file->mutex);
+	list_add_tail(&uobj->list, &file->ucontext->rule_list);
+	mutex_unlock(&file->mutex);
+
+	uobj->live = 1;
+
+	up_write(&uobj->mutex);
+	kfree(flow_attr);
+	if (cmd.flow_attr.num_of_specs)
+		kfree(kern_flow_attr);
+	return in_len;
+err_copy:
+	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+	ib_destroy_flow(flow_id);
+err_free:
+	kfree(flow_attr);
+err_put:
+	put_qp_read(qp);
+err_uobj:
+	put_uobj_write(uobj);
+err_free_attr:
+	if (cmd.flow_attr.num_of_specs)
+		kfree(kern_flow_attr);
+	return err;
+}
+
+ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len) {
+	struct ib_uverbs_destroy_flow	cmd;
+	struct ib_flow			*flow_id;
+	struct ib_uobject		*uobj;
+	int				ret;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+			      file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	flow_id = uobj->object;
+
+	ret = ib_destroy_flow(flow_id);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return ret ? ret : in_len;
+}
+
 static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
 				struct ib_uverbs_create_xsrq *cmd,
 				struct ib_udata *udata)
@@ -2860,6 +3098,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 	struct ib_srq               	 *srq;
 	struct ib_uevent_object        	 *obj;
 	int                         	  ret = -EINVAL;
+	struct ib_usrq_object		 *us;
+	enum ib_srq_type		  srq_type;
 
 	if (copy_from_user(&cmd, buf, sizeof cmd))
 		return -EFAULT;
@@ -2869,6 +3109,7 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 		return -EINVAL;
 	srq = uobj->object;
 	obj = container_of(uobj, struct ib_uevent_object, uobject);
+	srq_type = srq->srq_type;
 
 	ret = ib_destroy_srq(srq);
 	if (!ret)
@@ -2879,6 +3120,11 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
 	if (ret)
 		return ret;
 
+	if (srq_type == IB_SRQT_XRC) {
+		us = container_of(obj, struct ib_usrq_object, uevent);
+		atomic_dec(&us->uxrcd->refcnt);
+	}
+
 	idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
 
 	mutex_lock(&file->mutex);

+ 36 - 6
drivers/infiniband/core/uverbs_main.c

@@ -73,6 +73,7 @@ DEFINE_IDR(ib_uverbs_cq_idr);
 DEFINE_IDR(ib_uverbs_qp_idr);
 DEFINE_IDR(ib_uverbs_srq_idr);
 DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
 
 static DEFINE_SPINLOCK(map_lock);
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -113,7 +114,9 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 	[IB_USER_VERBS_CMD_OPEN_XRCD]		= ib_uverbs_open_xrcd,
 	[IB_USER_VERBS_CMD_CLOSE_XRCD]		= ib_uverbs_close_xrcd,
 	[IB_USER_VERBS_CMD_CREATE_XSRQ]		= ib_uverbs_create_xsrq,
-	[IB_USER_VERBS_CMD_OPEN_QP]		= ib_uverbs_open_qp
+	[IB_USER_VERBS_CMD_OPEN_QP]		= ib_uverbs_open_qp,
+	[IB_USER_VERBS_CMD_CREATE_FLOW]		= ib_uverbs_create_flow,
+	[IB_USER_VERBS_CMD_DESTROY_FLOW]	= ib_uverbs_destroy_flow
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -212,6 +215,14 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		kfree(uobj);
 	}
 
+	list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+		struct ib_flow *flow_id = uobj->object;
+
+		idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+		ib_destroy_flow(flow_id);
+		kfree(uobj);
+	}
+
 	list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
 		struct ib_qp *qp = uobj->object;
 		struct ib_uqp_object *uqp =
@@ -583,9 +594,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	if (copy_from_user(&hdr, buf, sizeof hdr))
 		return -EFAULT;
 
-	if (hdr.in_words * 4 != count)
-		return -EINVAL;
-
 	if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) ||
 	    !uverbs_cmd_table[hdr.command])
 		return -EINVAL;
@@ -597,8 +605,30 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
 	if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command)))
 		return -ENOSYS;
 
-	return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr,
-					     hdr.in_words * 4, hdr.out_words * 4);
+	if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) {
+		struct ib_uverbs_cmd_hdr_ex hdr_ex;
+
+		if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex)))
+			return -EFAULT;
+
+		if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count)
+			return -EINVAL;
+
+		return uverbs_cmd_table[hdr.command](file,
+						     buf + sizeof(hdr_ex),
+						     (hdr_ex.in_words +
+						      hdr_ex.provider_in_words) * 4,
+						     (hdr_ex.out_words +
+						      hdr_ex.provider_out_words) * 4);
+	} else {
+		if (hdr.in_words * 4 != count)
+			return -EINVAL;
+
+		return uverbs_cmd_table[hdr.command](file,
+						     buf + sizeof(hdr),
+						     hdr.in_words * 4,
+						     hdr.out_words * 4);
+	}
 }
 
 static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
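For commands at or above IB_USER_VERBS_CMD_THRESHOLD, the write() path above re-reads the header in its extended form and requires the request length to cover both the core and the provider-specific input words; legacy commands keep the old in_words * 4 rule. A minimal sketch of that length check, using only the header fields referenced above (hypothetical helper, not part of this commit):

#include <linux/types.h>

/*
 * Illustrative restatement of the size rule enforced by
 * ib_uverbs_write() above: extended commands account for the
 * provider-specific words, legacy commands do not.
 */
static bool uverbs_req_len_ok(bool extended, u16 in_words,
			      u16 provider_in_words, size_t count)
{
	size_t expected = extended ?
		((size_t)in_words + provider_in_words) * 4 :
		(size_t)in_words * 4;

	return expected == count;
}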

+ 30 - 0
drivers/infiniband/core/verbs.c

@@ -346,10 +346,13 @@ EXPORT_SYMBOL(ib_destroy_srq);
 static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
 {
 	struct ib_qp *qp = context;
+	unsigned long flags;
 
+	spin_lock_irqsave(&qp->device->event_handler_lock, flags);
 	list_for_each_entry(event->element.qp, &qp->open_list, open_list)
 		if (event->element.qp->event_handler)
 			event->element.qp->event_handler(event, event->element.qp->qp_context);
+	spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
 }
 
 static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
@@ -1254,3 +1257,30 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
 	return xrcd->device->dealloc_xrcd(xrcd);
 }
 EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+			       struct ib_flow_attr *flow_attr,
+			       int domain)
+{
+	struct ib_flow *flow_id;
+	if (!qp->device->create_flow)
+		return ERR_PTR(-ENOSYS);
+
+	flow_id = qp->device->create_flow(qp, flow_attr, domain);
+	if (!IS_ERR(flow_id))
+		atomic_inc(&qp->usecnt);
+	return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+	int err;
+	struct ib_qp *qp = flow_id->qp;
+
+	err = qp->device->destroy_flow(flow_id);
+	if (!err)
+		atomic_dec(&qp->usecnt);
+	return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
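
The new ib_create_flow() entry point takes an ib_flow_attr immediately followed in memory by its packed specs, which is the layout ib_uverbs_create_flow() builds above, and the attribute buffer may be freed once the call returns (the uverbs handler does so). A minimal kernel-side sketch of a caller attaching a single Ethernet spec; the helper is hypothetical, field names are taken from the uverbs and mlx4 hunks above:

#include <linux/err.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <rdma/ib_verbs.h>

/* Hypothetical consumer of the new API: steer frames for one
 * destination MAC on a port to the given QP. */
static struct ib_flow *attach_dmac_rule(struct ib_qp *qp, const u8 *dmac,
					u8 port)
{
	struct ib_flow_attr *attr;
	struct ib_flow_spec_eth *eth;
	struct ib_flow *flow;

	attr = kzalloc(sizeof(*attr) + sizeof(*eth), GFP_KERNEL);
	if (!attr)
		return ERR_PTR(-ENOMEM);

	attr->type	   = IB_FLOW_ATTR_NORMAL;
	attr->size	   = sizeof(*attr) + sizeof(*eth);
	attr->num_of_specs = 1;
	attr->port	   = port;

	/* specs are packed right after the attr, as in ib_uverbs_create_flow() */
	eth = (struct ib_flow_spec_eth *)(attr + 1);
	eth->type = IB_FLOW_SPEC_ETH;
	eth->size = sizeof(*eth);
	memcpy(eth->val.dst_mac, dmac, ETH_ALEN);
	memset(eth->mask.dst_mac, 0xff, ETH_ALEN);	/* match the full DMAC */

	flow = ib_create_flow(qp, attr, IB_FLOW_DOMAIN_USER);
	kfree(attr);		/* drivers parse the attr during the call */
	return flow;
}

A successful rule holds a reference on the QP (qp->usecnt above) until it is released with ib_destroy_flow().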

+ 235 - 0
drivers/infiniband/hw/mlx4/main.c

@@ -54,6 +54,8 @@
 #define DRV_VERSION	"1.0"
 #define DRV_RELDATE	"April 4, 2008"
 
+#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
+
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -88,6 +90,25 @@ static void init_query_mad(struct ib_smp *mad)
 
 static union ib_gid zgid;
 
+static int check_flow_steering_support(struct mlx4_dev *dev)
+{
+	int ib_num_ports = 0;
+	int i;
+
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+		ib_num_ports++;
+
+	if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+		if (ib_num_ports || mlx4_is_mfunc(dev)) {
+			pr_warn("Device managed flow steering is unavailable "
+				"for IB ports or in multifunction env.\n");
+			return 0;
+		}
+		return 1;
+	}
+	return 0;
+}
+
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props)
 {
@@ -144,6 +165,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
 		else
 			props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+	if (check_flow_steering_support(dev->dev))
+		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
 	}
 
 	props->vendor_id	   = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
@@ -798,6 +821,209 @@ struct mlx4_ib_steering {
 	union ib_gid gid;
 };
 
+static int parse_flow_attr(struct mlx4_dev *dev,
+			   union ib_flow_spec *ib_spec,
+			   struct _rule_hw *mlx4_spec)
+{
+	enum mlx4_net_trans_rule_id type;
+
+	switch (ib_spec->type) {
+	case IB_FLOW_SPEC_ETH:
+		type = MLX4_NET_TRANS_RULE_ID_ETH;
+		memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac,
+		       ETH_ALEN);
+		memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac,
+		       ETH_ALEN);
+		mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
+		mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+		break;
+
+	case IB_FLOW_SPEC_IPV4:
+		type = MLX4_NET_TRANS_RULE_ID_IPV4;
+		mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
+		mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip;
+		mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip;
+		mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip;
+		break;
+
+	case IB_FLOW_SPEC_TCP:
+	case IB_FLOW_SPEC_UDP:
+		type = ib_spec->type == IB_FLOW_SPEC_TCP ?
+					MLX4_NET_TRANS_RULE_ID_TCP :
+					MLX4_NET_TRANS_RULE_ID_UDP;
+		mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
+		mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port;
+		mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
+		mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+	if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 ||
+	    mlx4_hw_rule_sz(dev, type) < 0)
+		return -EINVAL;
+	mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type));
+	mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2;
+	return mlx4_hw_rule_sz(dev, type);
+}
+
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+			  int domain,
+			  enum mlx4_net_trans_promisc_mode flow_type,
+			  u64 *reg_id)
+{
+	int ret, i;
+	int size = 0;
+	void *ib_flow;
+	struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+	struct mlx4_cmd_mailbox *mailbox;
+	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+	size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) +
+			   (sizeof(struct _rule_hw) * flow_attr->num_of_specs);
+
+	static const u16 __mlx4_domain[] = {
+		[IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
+		[IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
+		[IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
+		[IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
+	};
+
+	if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+		pr_err("Invalid priority value %d\n", flow_attr->priority);
+		return -EINVAL;
+	}
+
+	if (domain >= IB_FLOW_DOMAIN_NUM) {
+		pr_err("Invalid domain value %d\n", domain);
+		return -EINVAL;
+	}
+
+	if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+		return -EINVAL;
+
+	mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+	memset(mailbox->buf, 0, rule_size);
+	ctrl = mailbox->buf;
+
+	ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
+				 flow_attr->priority);
+	ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+	ctrl->port = flow_attr->port;
+	ctrl->qpn = cpu_to_be32(qp->qp_num);
+
+	ib_flow = flow_attr + 1;
+	size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+	for (i = 0; i < flow_attr->num_of_specs; i++) {
+		ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size);
+		if (ret < 0) {
+			mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+			return -EINVAL;
+		}
+		ib_flow += ((union ib_flow_spec *) ib_flow)->size;
+		size += ret;
+	}
+
+	ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+			   MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+			   MLX4_CMD_NATIVE);
+	if (ret == -ENOMEM)
+		pr_err("mcg table is full. Fail to register network rule.\n");
+	else if (ret == -ENXIO)
+		pr_err("Device managed flow steering is disabled. Fail to register network rule.\n");
+	else if (ret)
+		pr_err("Invalid argumant. Fail to register network rule.\n");
+
+	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+	return ret;
+}
+
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+{
+	int err;
+	err = mlx4_cmd(dev, reg_id, 0, 0,
+		       MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+		       MLX4_CMD_NATIVE);
+	if (err)
+		pr_err("Fail to detach network rule. registration id = 0x%llx\n",
+		       reg_id);
+	return err;
+}
+
+static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
+				    struct ib_flow_attr *flow_attr,
+				    int domain)
+{
+	int err = 0, i = 0;
+	struct mlx4_ib_flow *mflow;
+	enum mlx4_net_trans_promisc_mode type[2];
+
+	memset(type, 0, sizeof(type));
+
+	mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
+	if (!mflow) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	switch (flow_attr->type) {
+	case IB_FLOW_ATTR_NORMAL:
+		type[0] = MLX4_FS_REGULAR;
+		break;
+
+	case IB_FLOW_ATTR_ALL_DEFAULT:
+		type[0] = MLX4_FS_ALL_DEFAULT;
+		break;
+
+	case IB_FLOW_ATTR_MC_DEFAULT:
+		type[0] = MLX4_FS_MC_DEFAULT;
+		break;
+
+	case IB_FLOW_ATTR_SNIFFER:
+		type[0] = MLX4_FS_UC_SNIFFER;
+		type[1] = MLX4_FS_MC_SNIFFER;
+		break;
+
+	default:
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	while (i < ARRAY_SIZE(type) && type[i]) {
+		err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
+					    &mflow->reg_id[i]);
+		if (err)
+			goto err_free;
+		i++;
+	}
+
+	return &mflow->ibflow;
+
+err_free:
+	kfree(mflow);
+	return ERR_PTR(err);
+}
+
+static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
+{
+	int err, ret = 0;
+	int i = 0;
+	struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
+	struct mlx4_ib_flow *mflow = to_mflow(flow_id);
+
+	while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) {
+		err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]);
+		if (err)
+			ret = err;
+		i++;
+	}
+
+	kfree(mflow);
+	return ret;
+}
+
 static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 {
 	int err;
@@ -1461,6 +1687,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
 	}
 
+	if (check_flow_steering_support(dev)) {
+		ibdev->ib_dev.create_flow	= mlx4_ib_create_flow;
+		ibdev->ib_dev.destroy_flow	= mlx4_ib_destroy_flow;
+
+		ibdev->ib_dev.uverbs_cmd_mask	|=
+			(1ull << IB_USER_VERBS_CMD_CREATE_FLOW) |
+			(1ull << IB_USER_VERBS_CMD_DESTROY_FLOW);
+	}
+
 	mlx4_ib_alloc_eqs(dev, ibdev);
 
 	spin_lock_init(&iboe->lock);

+ 12 - 0
drivers/infiniband/hw/mlx4/mlx4_ib.h

@@ -132,6 +132,12 @@ struct mlx4_ib_fmr {
 	struct mlx4_fmr         mfmr;
 };
 
+struct mlx4_ib_flow {
+	struct ib_flow ibflow;
+	/* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
+	u64 reg_id[2];
+};
+
 struct mlx4_ib_wq {
 	u64		       *wrid;
 	spinlock_t		lock;
@@ -552,6 +558,12 @@ static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
 {
 	return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
 }
+
+static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
+{
+	return container_of(ibflow, struct mlx4_ib_flow, ibflow);
+}
+
 static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
 {
 	return container_of(ibqp, struct mlx4_ib_qp, ibqp);

+ 12 - 10
drivers/infiniband/hw/ocrdma/ocrdma.h

@@ -56,10 +56,12 @@ struct ocrdma_dev_attr {
 	u16 max_qp;
 	u16 max_wqe;
 	u16 max_rqe;
+	u16 max_srq;
 	u32 max_inline_data;
 	int max_send_sge;
 	int max_recv_sge;
 	int max_srq_sge;
+	int max_rdma_sge;
 	int max_mr;
 	u64 max_mr_size;
 	u32 max_num_mr_pbl;
@@ -130,8 +132,7 @@ struct ocrdma_dev {
 	struct ocrdma_cq **cq_tbl;
 	struct ocrdma_qp **qp_tbl;
 
-	struct ocrdma_eq meq;
-	struct ocrdma_eq *qp_eq_tbl;
+	struct ocrdma_eq *eq_tbl;
 	int eq_cnt;
 	u16 base_eqid;
 	u16 max_eq;
@@ -168,11 +169,12 @@ struct ocrdma_dev {
 	struct list_head entry;
 	struct rcu_head rcu;
 	int id;
+	u64 stag_arr[OCRDMA_MAX_STAG];
+	u16 pvid;
 };
 
 struct ocrdma_cq {
 	struct ib_cq ibcq;
-	struct ocrdma_dev *dev;
 	struct ocrdma_cqe *va;
 	u32 phase;
 	u32 getp;	/* pointer to pending wrs to
@@ -214,7 +216,6 @@ struct ocrdma_pd {
 
 struct ocrdma_ah {
 	struct ib_ah ibah;
-	struct ocrdma_dev *dev;
 	struct ocrdma_av *av;
 	u16 sgid_index;
 	u32 id;
@@ -234,7 +235,6 @@ struct ocrdma_qp_hwq_info {
 
 struct ocrdma_srq {
 	struct ib_srq ibsrq;
-	struct ocrdma_dev *dev;
 	u8 __iomem *db;
 	struct ocrdma_qp_hwq_info rq;
 	u64 *rqe_wr_id_tbl;
@@ -290,10 +290,11 @@ struct ocrdma_qp {
 	u32 qkey;
 	bool dpp_enabled;
 	u8 *ird_q_va;
+	bool signaled;
+	u16 db_cache;
 };
 
 struct ocrdma_hw_mr {
-	struct ocrdma_dev *dev;
 	u32 lkey;
 	u8 fr_mr;
 	u8 remote_atomic;
@@ -317,15 +318,16 @@ struct ocrdma_mr {
 	struct ib_mr ibmr;
 	struct ib_umem *umem;
 	struct ocrdma_hw_mr hwmr;
-	struct ocrdma_pd *pd;
 };
 
 struct ocrdma_ucontext {
 	struct ib_ucontext ibucontext;
-	struct ocrdma_dev *dev;
 
 	struct list_head mm_head;
 	struct mutex mm_list_lock; /* protects list entries of mm type */
+	struct ocrdma_pd *cntxt_pd;
+	int pd_in_use;
+
 	struct {
 		u32 *va;
 		dma_addr_t pa;
@@ -386,14 +388,14 @@ static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq)
 static inline int ocrdma_get_num_posted_shift(struct ocrdma_qp *qp)
 {
 	return ((qp->dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY &&
-		 qp->id < 64) ? 24 : 16);
+		 qp->id < 128) ? 24 : 16);
 }
 
 static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe)
 {
 	int cqe_valid;
 	cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID;
-	return ((cqe_valid == cq->phase) ? 1 : 0);
+	return (cqe_valid == cq->phase);
 }
 
 static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe)

+ 18 - 14
drivers/infiniband/hw/ocrdma/ocrdma_abi.h

@@ -28,6 +28,9 @@
 #ifndef __OCRDMA_ABI_H__
 #define __OCRDMA_ABI_H__
 
+#define OCRDMA_ABI_VERSION 1
+/* user kernel communication data structures. */
+
 struct ocrdma_alloc_ucontext_resp {
 	u32 dev_id;
 	u32 wqe_size;
@@ -35,16 +38,16 @@ struct ocrdma_alloc_ucontext_resp {
 	u32 dpp_wqe_size;
 	u64 ah_tbl_page;
 	u32 ah_tbl_len;
-	u32 rsvd;
-	u8 fw_ver[32];
 	u32 rqe_size;
+	u8 fw_ver[32];
+	/* for future use/new features in progress */
 	u64 rsvd1;
-} __packed;
+	u64 rsvd2;
+};
 
-/* user kernel communication data structures. */
 struct ocrdma_alloc_pd_ureq {
 	u64 rsvd1;
-} __packed;
+};
 
 struct ocrdma_alloc_pd_uresp {
 	u32 id;
@@ -52,12 +55,12 @@ struct ocrdma_alloc_pd_uresp {
 	u32 dpp_page_addr_hi;
 	u32 dpp_page_addr_lo;
 	u64 rsvd1;
-} __packed;
+};
 
 struct ocrdma_create_cq_ureq {
 	u32 dpp_cq;
-	u32 rsvd;
-} __packed;
+	u32 rsvd; /* pad */
+};
 
 #define MAX_CQ_PAGES 8
 struct ocrdma_create_cq_uresp {
@@ -69,9 +72,10 @@ struct ocrdma_create_cq_uresp {
 	u64 db_page_addr;
 	u32 db_page_size;
 	u32 phase_change;
+	/* for future use/new features in progress */
 	u64 rsvd1;
 	u64 rsvd2;
-} __packed;
+};
 
 #define MAX_QP_PAGES 8
 #define MAX_UD_AV_PAGES 8
@@ -80,14 +84,14 @@ struct ocrdma_create_qp_ureq {
 	u8 enable_dpp_cq;
 	u8 rsvd;
 	u16 dpp_cq_id;
-	u32 rsvd1;
+	u32 rsvd1;	/* pad */
 };
 
 struct ocrdma_create_qp_uresp {
 	u16 qp_id;
 	u16 sq_dbid;
 	u16 rq_dbid;
-	u16 resv0;
+	u16 resv0;	/* pad */
 	u32 sq_page_size;
 	u32 rq_page_size;
 	u32 num_sq_pages;
@@ -98,19 +102,19 @@ struct ocrdma_create_qp_uresp {
 	u32 db_page_size;
 	u32 dpp_credit;
 	u32 dpp_offset;
-	u32 rsvd1;
 	u32 num_wqe_allocated;
 	u32 num_rqe_allocated;
 	u32 db_sq_offset;
 	u32 db_rq_offset;
 	u32 db_shift;
+	u64 rsvd1;
 	u64 rsvd2;
 	u64 rsvd3;
 } __packed;
 
 struct ocrdma_create_srq_uresp {
 	u16 rq_dbid;
-	u16 resv0;
+	u16 resv0;	/* pad */
 	u32 resv1;
 
 	u32 rq_page_size;
@@ -126,6 +130,6 @@ struct ocrdma_create_srq_uresp {
 
 	u64 rsvd2;
 	u64 rsvd3;
-} __packed;
+};
 
 #endif				/* __OCRDMA_ABI_H__ */

+ 8 - 6
drivers/infiniband/hw/ocrdma/ocrdma_ah.c

@@ -35,12 +35,11 @@
 #include "ocrdma_ah.h"
 #include "ocrdma_hw.h"
 
-static inline int set_av_attr(struct ocrdma_ah *ah,
+static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 				struct ib_ah_attr *attr, int pdid)
 {
 	int status = 0;
 	u16 vlan_tag; bool vlan_enabled = false;
-	struct ocrdma_dev *dev = ah->dev;
 	struct ocrdma_eth_vlan eth;
 	struct ocrdma_grh grh;
 	int eth_sz;
@@ -51,6 +50,8 @@ static inline int set_av_attr(struct ocrdma_ah *ah,
 	ah->sgid_index = attr->grh.sgid_index;
 
 	vlan_tag = rdma_get_vlan_id(&attr->grh.dgid);
+	if (!vlan_tag || (vlan_tag > 0xFFF))
+		vlan_tag = dev->pvid;
 	if (vlan_tag && (vlan_tag < 0x1000)) {
 		eth.eth_type = cpu_to_be16(0x8100);
 		eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
@@ -92,7 +93,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 	int status;
 	struct ocrdma_ah *ah;
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-	struct ocrdma_dev *dev = pd->dev;
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 
 	if (!(attr->ah_flags & IB_AH_GRH))
 		return ERR_PTR(-EINVAL);
@@ -100,12 +101,11 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 	ah = kzalloc(sizeof *ah, GFP_ATOMIC);
 	if (!ah)
 		return ERR_PTR(-ENOMEM);
-	ah->dev = pd->dev;
 
 	status = ocrdma_alloc_av(dev, ah);
 	if (status)
 		goto av_err;
-	status = set_av_attr(ah, attr, pd->id);
+	status = set_av_attr(dev, ah, attr, pd->id);
 	if (status)
 		goto av_conf_err;
 
@@ -126,7 +126,9 @@ av_err:
 int ocrdma_destroy_ah(struct ib_ah *ibah)
 {
 	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
-	ocrdma_free_av(ah->dev, ah);
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
+
+	ocrdma_free_av(dev, ah);
 	kfree(ah);
 	return 0;
 }

+ 207 - 251
drivers/infiniband/hw/ocrdma/ocrdma_hw.c

@@ -94,7 +94,7 @@ enum cqe_status {
 
 static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq)
 {
-	return (u8 *)eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe));
+	return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe));
 }
 
 static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq)
@@ -105,8 +105,7 @@ static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq)
 static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev)
 {
 	struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *)
-	    ((u8 *) dev->mq.cq.va +
-	     (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe)));
+	    (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe)));
 
 	if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK))
 		return NULL;
@@ -120,9 +119,7 @@ static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev)
 
 static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev)
 {
-	return (struct ocrdma_mqe *)((u8 *) dev->mq.sq.va +
-				     (dev->mq.sq.head *
-				      sizeof(struct ocrdma_mqe)));
+	return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe));
 }
 
 static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev)
@@ -132,8 +129,7 @@ static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev)
 
 static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev)
 {
-	return (void *)((u8 *) dev->mq.sq.va +
-			(dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe)));
+	return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe));
 }
 
 enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps)
@@ -181,7 +177,7 @@ static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps)
 
 static int ocrdma_get_mbx_errno(u32 status)
 {
-	int err_num = -EFAULT;
+	int err_num;
 	u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >>
 					OCRDMA_MBX_RSP_STATUS_SHIFT;
 	u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >>
@@ -260,10 +256,11 @@ static int ocrdma_get_mbx_cqe_errno(u16 cqe_status)
 		break;
 	case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES:
 	case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING:
-		err_num = -EAGAIN;
+		err_num = -EINVAL;
 		break;
 	case OCRDMA_MBX_CQE_STATUS_DMA_FAILED:
-		err_num = -EIO;
+	default:
+		err_num = -EINVAL;
 		break;
 	}
 	return err_num;
@@ -367,22 +364,6 @@ static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt,
 	}
 }
 
-static void ocrdma_assign_eq_vect_gen2(struct ocrdma_dev *dev,
-				       struct ocrdma_eq *eq)
-{
-	/* assign vector and update vector id for next EQ */
-	eq->vector = dev->nic_info.msix.start_vector;
-	dev->nic_info.msix.start_vector += 1;
-}
-
-static void ocrdma_free_eq_vect_gen2(struct ocrdma_dev *dev)
-{
-	/* this assumes that EQs are freed in exactly reverse order
-	 * as its allocation.
-	 */
-	dev->nic_info.msix.start_vector -= 1;
-}
-
 static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q,
 			       int queue_type)
 {
@@ -423,11 +404,8 @@ static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
 	memset(cmd, 0, sizeof(*cmd));
 	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON,
 			sizeof(*cmd));
-	if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY)
-		cmd->req.rsvd_version = 0;
-	else
-		cmd->req.rsvd_version = 2;
 
+	cmd->req.rsvd_version = 2;
 	cmd->num_pages = 4;
 	cmd->valid = OCRDMA_CREATE_EQ_VALID;
 	cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT;
@@ -438,12 +416,7 @@ static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
 				 NULL);
 	if (!status) {
 		eq->q.id = rsp->vector_eqid & 0xffff;
-		if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY)
-			ocrdma_assign_eq_vect_gen2(dev, eq);
-		else {
-			eq->vector = (rsp->vector_eqid >> 16) & 0xffff;
-			dev->nic_info.msix.start_vector += 1;
-		}
+		eq->vector = (rsp->vector_eqid >> 16) & 0xffff;
 		eq->q.created = true;
 	}
 	return status;
@@ -486,8 +459,6 @@ static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
 {
 	if (eq->q.created) {
 		ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ);
-		if (dev->nic_info.dev_family == OCRDMA_GEN2_FAMILY)
-			ocrdma_free_eq_vect_gen2(dev);
 		ocrdma_free_q(dev, &eq->q);
 	}
 }
@@ -506,13 +477,12 @@ static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
 	_ocrdma_destroy_eq(dev, eq);
 }
 
-static void ocrdma_destroy_qp_eqs(struct ocrdma_dev *dev)
+static void ocrdma_destroy_eqs(struct ocrdma_dev *dev)
 {
 	int i;
 
-	/* deallocate the data path eqs */
 	for (i = 0; i < dev->eq_cnt; i++)
-		ocrdma_destroy_eq(dev, &dev->qp_eq_tbl[i]);
+		ocrdma_destroy_eq(dev, &dev->eq_tbl[i]);
 }
 
 static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev,
@@ -527,16 +497,21 @@ static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev,
 	ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ,
 			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
 
-	cmd->pgsz_pgcnt = PAGES_4K_SPANNED(cq->va, cq->size);
+	cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2;
+	cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+		OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT;
+	cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size);
+
 	cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
-	cmd->eqn = (eq->id << OCRDMA_CREATE_CQ_EQID_SHIFT);
+	cmd->eqn = eq->id;
+	cmd->cqe_count = cq->size / sizeof(struct ocrdma_mcqe);
 
-	ocrdma_build_q_pages(&cmd->pa[0], cmd->pgsz_pgcnt,
+	ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE,
 			     cq->dma, PAGE_SIZE_4K);
 	status = be_roce_mcc_cmd(dev->nic_info.netdev,
 				 cmd, sizeof(*cmd), NULL, NULL);
 	if (!status) {
-		cq->id = (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
+		cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
 		cq->created = true;
 	}
 	return status;
@@ -569,7 +544,10 @@ static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev,
 	cmd->cqid_pages = num_pages;
 	cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT);
 	cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID;
-	cmd->async_event_bitmap = Bit(20);
+
+	cmd->async_event_bitmap = Bit(OCRDMA_ASYNC_GRP5_EVE_CODE);
+	cmd->async_event_bitmap |= Bit(OCRDMA_ASYNC_RDMA_EVE_CODE);
+
 	cmd->async_cqid_ringsize = cq->id;
 	cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) <<
 				OCRDMA_CREATE_MQ_RING_SIZE_SHIFT);
@@ -596,7 +574,7 @@ static int ocrdma_create_mq(struct ocrdma_dev *dev)
 	if (status)
 		goto alloc_err;
 
-	status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->meq.q);
+	status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q);
 	if (status)
 		goto mbx_cq_free;
 
@@ -653,7 +631,7 @@ static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev,
 
 	if (qp == NULL)
 		BUG();
-	ocrdma_qp_state_machine(qp, new_ib_qps, &old_ib_qps);
+	ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps);
 }
 
 static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
@@ -746,11 +724,35 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
 			qp->srq->ibsrq.event_handler(&ib_evt,
 						     qp->srq->ibsrq.
 						     srq_context);
-	} else if (dev_event)
+	} else if (dev_event) {
 		ib_dispatch_event(&ib_evt);
+	}
 
 }
 
+static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev,
+					struct ocrdma_ae_mcqe *cqe)
+{
+	struct ocrdma_ae_pvid_mcqe *evt;
+	int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >>
+			OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT;
+
+	switch (type) {
+	case OCRDMA_ASYNC_EVENT_PVID_STATE:
+		evt = (struct ocrdma_ae_pvid_mcqe *)cqe;
+		if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >>
+			OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT)
+			dev->pvid = ((evt->tag_enabled &
+					OCRDMA_AE_PVID_MCQE_TAG_MASK) >>
+					OCRDMA_AE_PVID_MCQE_TAG_SHIFT);
+		break;
+	default:
+		/* Not interested evts. */
+		break;
+	}
+}
+
+
 static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)
 {
 	/* async CQE processing */
@@ -758,8 +760,10 @@ static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)
 	u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >>
 			OCRDMA_AE_MCQE_EVENT_CODE_SHIFT;
 
-	if (evt_code == OCRDMA_ASYNC_EVE_CODE)
+	if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE)
 		ocrdma_dispatch_ibevent(dev, cqe);
+	else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE)
+		ocrdma_process_grp5_aync(dev, cqe);
 	else
 		pr_err("%s(%d) invalid evt code=0x%x\n", __func__,
 		       dev->id, evt_code);
@@ -957,9 +961,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)
 	rsp = ocrdma_get_mqe_rsp(dev);
 	ocrdma_copy_le32_to_cpu(mqe, rsp, (sizeof(*mqe)));
 	if (cqe_status || ext_status) {
-		pr_err
-		    ("%s() opcode=0x%x, cqe_status=0x%x, ext_status=0x%x\n",
-		     __func__,
+		pr_err("%s() opcode=0x%x, cqe_status=0x%x, ext_status=0x%x\n",
+		       __func__,
 		     (rsp->u.rsp.subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >>
 		     OCRDMA_MBX_RSP_OPCODE_SHIFT, cqe_status, ext_status);
 		status = ocrdma_get_mbx_cqe_errno(cqe_status);
@@ -991,9 +994,15 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,
 	attr->max_srq_sge = (rsp->max_srq_rqe_sge &
 			      OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >>
 	    OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET;
+	attr->max_rdma_sge = (rsp->max_write_send_sge &
+			      OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >>
+	    OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT;
 	attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp &
 				OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >>
 	    OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT;
+	attr->max_srq =
+		(rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >>
+		OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;
 	attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp &
 				OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >>
 	    OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT;
@@ -1013,6 +1022,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,
 	attr->max_num_mr_pbl = rsp->max_num_mr_pbl;
 	attr->max_cqe = rsp->max_cq_cqes_per_cq &
 			OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK;
+	attr->max_cq = (rsp->max_cq_cqes_per_cq &
+			OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >>
+			OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET;
 	attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs &
 		OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >>
 		OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) *
@@ -1045,7 +1057,6 @@ static int ocrdma_check_fw_config(struct ocrdma_dev *dev,
 		return -EINVAL;
 	dev->base_eqid = conf->base_eqid;
 	dev->max_eq = conf->max_eq;
-	dev->attr.max_cq = OCRDMA_MAX_CQ - 1;
 	return 0;
 }
 
@@ -1118,6 +1129,34 @@ mbx_err:
 	return status;
 }
 
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed)
+{
+	int status = -ENOMEM;
+	struct ocrdma_get_link_speed_rsp *rsp;
+	struct ocrdma_mqe *cmd;
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+				  sizeof(*cmd));
+	if (!cmd)
+		return status;
+	ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+			OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+			OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+	((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1;
+
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+	if (status)
+		goto mbx_err;
+
+	rsp = (struct ocrdma_get_link_speed_rsp *)cmd;
+	*lnk_speed = rsp->phys_port_speed;
+
+mbx_err:
+	kfree(cmd);
+	return status;
+}
+
 int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
 {
 	int status = -ENOMEM;
@@ -1296,19 +1335,19 @@ static u16 ocrdma_bind_eq(struct ocrdma_dev *dev)
 	u16 eq_id;
 
 	mutex_lock(&dev->dev_lock);
-	cq_cnt = dev->qp_eq_tbl[0].cq_cnt;
-	eq_id = dev->qp_eq_tbl[0].q.id;
+	cq_cnt = dev->eq_tbl[0].cq_cnt;
+	eq_id = dev->eq_tbl[0].q.id;
 	/* find the EQ which is has the least number of
 	 * CQs associated with it.
 	 */
 	for (i = 0; i < dev->eq_cnt; i++) {
-		if (dev->qp_eq_tbl[i].cq_cnt < cq_cnt) {
-			cq_cnt = dev->qp_eq_tbl[i].cq_cnt;
-			eq_id = dev->qp_eq_tbl[i].q.id;
+		if (dev->eq_tbl[i].cq_cnt < cq_cnt) {
+			cq_cnt = dev->eq_tbl[i].cq_cnt;
+			eq_id = dev->eq_tbl[i].q.id;
 			selected_eq = i;
 		}
 	}
-	dev->qp_eq_tbl[selected_eq].cq_cnt += 1;
+	dev->eq_tbl[selected_eq].cq_cnt += 1;
 	mutex_unlock(&dev->dev_lock);
 	return eq_id;
 }
@@ -1319,16 +1358,16 @@ static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id)
 
 	mutex_lock(&dev->dev_lock);
 	for (i = 0; i < dev->eq_cnt; i++) {
-		if (dev->qp_eq_tbl[i].q.id != eq_id)
+		if (dev->eq_tbl[i].q.id != eq_id)
 			continue;
-		dev->qp_eq_tbl[i].cq_cnt -= 1;
+		dev->eq_tbl[i].cq_cnt -= 1;
 		break;
 	}
 	mutex_unlock(&dev->dev_lock);
 }
 
 int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
-			 int entries, int dpp_cq)
+			 int entries, int dpp_cq, u16 pd_id)
 {
 	int status = -ENOMEM; int max_hw_cqe;
 	struct pci_dev *pdev = dev->nic_info.pdev;
@@ -1336,8 +1375,6 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
 	struct ocrdma_create_cq_rsp *rsp;
 	u32 hw_pages, cqe_size, page_size, cqe_count;
 
-	if (dpp_cq)
-		return -EINVAL;
 	if (entries > dev->attr.max_cqe) {
 		pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n",
 		       __func__, dev->id, dev->attr.max_cqe, entries);
@@ -1377,15 +1414,13 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
 	cmd->cmd.pgsz_pgcnt |= hw_pages;
 	cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
 
-	if (dev->eq_cnt < 0)
-		goto eq_err;
 	cq->eqn = ocrdma_bind_eq(dev);
-	cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER2;
+	cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3;
 	cqe_count = cq->len / cqe_size;
-	if (cqe_count > 1024)
+	if (cqe_count > 1024) {
 		/* Set cnt to 3 to indicate more than 1024 cq entries */
 		cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT);
-	else {
+	} else {
 		u8 count = 0;
 		switch (cqe_count) {
 		case 256:
@@ -1416,6 +1451,7 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
 		cq->phase_change = true;
 	}
 
+	cmd->cmd.pd_id = pd_id; /* valid only for v3 */
 	ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size);
 	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	if (status)
@@ -1427,7 +1463,6 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
 	return 0;
 mbx_err:
 	ocrdma_unbind_eq(dev, cq->eqn);
-eq_err:
 	dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa);
 mem_err:
 	kfree(cmd);
@@ -1524,6 +1559,7 @@ static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
 		return -ENOMEM;
 	cmd->num_pbl_pdid =
 	    pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT);
+	cmd->fr_mr = hwmr->fr_mr;
 
 	cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr <<
 				    OCRDMA_REG_NSMR_REMOTE_WR_SHIFT);
@@ -1678,8 +1714,16 @@ void ocrdma_flush_qp(struct ocrdma_qp *qp)
 	spin_unlock_irqrestore(&qp->dev->flush_q_lock, flags);
 }
 
-int ocrdma_qp_state_machine(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state,
-			    enum ib_qp_state *old_ib_state)
+static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp)
+{
+	qp->sq.head = 0;
+	qp->sq.tail = 0;
+	qp->rq.head = 0;
+	qp->rq.tail = 0;
+}
+
+int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state,
+			   enum ib_qp_state *old_ib_state)
 {
 	unsigned long flags;
 	int status = 0;
@@ -1696,96 +1740,15 @@ int ocrdma_qp_state_machine(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state,
 		return 1;
 	}
 
-	switch (qp->state) {
-	case OCRDMA_QPS_RST:
-		switch (new_state) {
-		case OCRDMA_QPS_RST:
-		case OCRDMA_QPS_INIT:
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_INIT:
-		/* qps: INIT->XXX */
-		switch (new_state) {
-		case OCRDMA_QPS_INIT:
-		case OCRDMA_QPS_RTR:
-			break;
-		case OCRDMA_QPS_ERR:
-			ocrdma_flush_qp(qp);
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_RTR:
-		/* qps: RTS->XXX */
-		switch (new_state) {
-		case OCRDMA_QPS_RTS:
-			break;
-		case OCRDMA_QPS_ERR:
-			ocrdma_flush_qp(qp);
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_RTS:
-		/* qps: RTS->XXX */
-		switch (new_state) {
-		case OCRDMA_QPS_SQD:
-		case OCRDMA_QPS_SQE:
-			break;
-		case OCRDMA_QPS_ERR:
-			ocrdma_flush_qp(qp);
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_SQD:
-		/* qps: SQD->XXX */
-		switch (new_state) {
-		case OCRDMA_QPS_RTS:
-		case OCRDMA_QPS_SQE:
-		case OCRDMA_QPS_ERR:
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_SQE:
-		switch (new_state) {
-		case OCRDMA_QPS_RTS:
-		case OCRDMA_QPS_ERR:
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	case OCRDMA_QPS_ERR:
-		/* qps: ERR->XXX */
-		switch (new_state) {
-		case OCRDMA_QPS_RST:
-			break;
-		default:
-			status = -EINVAL;
-			break;
-		};
-		break;
-	default:
-		status = -EINVAL;
-		break;
-	};
-	if (!status)
-		qp->state = new_state;
+
+	if (new_state == OCRDMA_QPS_INIT) {
+		ocrdma_init_hwq_ptr(qp);
+		ocrdma_del_flush_qp(qp);
+	} else if (new_state == OCRDMA_QPS_ERR) {
+		ocrdma_flush_qp(qp);
+	}
+
+	qp->state = new_state;
 
 	spin_unlock_irqrestore(&qp->q_lock, flags);
 	return status;
@@ -1819,10 +1782,9 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,
 	u32 max_wqe_allocated;
 	u32 max_sges = attrs->cap.max_send_sge;
 
-	max_wqe_allocated = attrs->cap.max_send_wr;
-	/* need to allocate one extra to for GEN1 family */
-	if (dev->nic_info.dev_family != OCRDMA_GEN2_FAMILY)
-		max_wqe_allocated += 1;
+	/* QP1 may exceed 127 */
+	max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1,
+				dev->attr.max_wqe);
 
 	status = ocrdma_build_q_conf(&max_wqe_allocated,
 		dev->attr.wqe_size, &hw_pages, &hw_page_size);
@@ -1934,6 +1896,8 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd,
 	dma_addr_t pa = 0;
 	int ird_page_size = dev->attr.ird_page_size;
 	int ird_q_len = dev->attr.num_ird_pages * ird_page_size;
+	struct ocrdma_hdr_wqe *rqe;
+	int i  = 0;
 
 	if (dev->attr.ird == 0)
 		return 0;
@@ -1945,6 +1909,15 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd,
 	memset(qp->ird_q_va, 0, ird_q_len);
 	ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages,
 			     pa, ird_page_size);
+	for (; i < ird_q_len / dev->attr.rqe_size; i++) {
+		rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va +
+			(i * dev->attr.rqe_size));
+		rqe->cw = 0;
+		rqe->cw |= 2;
+		rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+		rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT);
+		rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT);
+	}
 	return 0;
 }
 
@@ -2057,9 +2030,10 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,
 	qp->rq_cq = cq;
 
 	if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp &&
-	    (attrs->cap.max_inline_data <= dev->attr.max_inline_data))
+	    (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) {
 		ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq,
 					     dpp_cq_id);
+	}
 
 	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	if (status)
@@ -2108,38 +2082,48 @@ int ocrdma_resolve_dgid(struct ocrdma_dev *dev, union ib_gid *dgid,
 	struct in6_addr in6;
 
 	memcpy(&in6, dgid, sizeof in6);
-	if (rdma_is_multicast_addr(&in6))
+	if (rdma_is_multicast_addr(&in6)) {
 		rdma_get_mcast_mac(&in6, mac_addr);
-	else if (rdma_link_local_addr(&in6))
+	} else if (rdma_link_local_addr(&in6)) {
 		rdma_get_ll_mac(&in6, mac_addr);
-	else {
+	} else {
 		pr_err("%s() fail to resolve mac_addr.\n", __func__);
 		return -EINVAL;
 	}
 	return 0;
 }
 
-static void ocrdma_set_av_params(struct ocrdma_qp *qp,
+static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 				struct ocrdma_modify_qp *cmd,
 				struct ib_qp_attr *attrs)
 {
+	int status;
 	struct ib_ah_attr *ah_attr = &attrs->ah_attr;
-	union ib_gid sgid;
+	union ib_gid sgid, zgid;
 	u32 vlan_id;
 	u8 mac_addr[6];
+
 	if ((ah_attr->ah_flags & IB_AH_GRH) == 0)
-		return;
+		return -EINVAL;
 	cmd->params.tclass_sq_psn |=
 	    (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT);
 	cmd->params.rnt_rc_sl_fl |=
 	    (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK);
+	cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT);
 	cmd->params.hop_lmt_rq_psn |=
 	    (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT);
 	cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID;
 	memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],
 	       sizeof(cmd->params.dgid));
-	ocrdma_query_gid(&qp->dev->ibdev, 1,
+	status = ocrdma_query_gid(&qp->dev->ibdev, 1,
 			 ah_attr->grh.sgid_index, &sgid);
+	if (status)
+		return status;
+
+	memset(&zgid, 0, sizeof(zgid));
+	if (!memcmp(&sgid, &zgid, sizeof(zgid)))
+		return -EINVAL;
+
 	qp->sgid_idx = ah_attr->grh.sgid_index;
 	memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid));
 	ocrdma_resolve_dgid(qp->dev, &ah_attr->grh.dgid, &mac_addr[0]);
@@ -2155,6 +2139,7 @@ static void ocrdma_set_av_params(struct ocrdma_qp *qp,
 		    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
 		cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
 	}
+	return 0;
 }
 
 static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
@@ -2163,8 +2148,6 @@ static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
 				enum ib_qp_state old_qps)
 {
 	int status = 0;
-	struct net_device *netdev = qp->dev->nic_info.netdev;
-	int eth_mtu = iboe_get_mtu(netdev->mtu);
 
 	if (attr_mask & IB_QP_PKEY_INDEX) {
 		cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index &
@@ -2176,9 +2159,11 @@ static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
 		cmd->params.qkey = attrs->qkey;
 		cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID;
 	}
-	if (attr_mask & IB_QP_AV)
-		ocrdma_set_av_params(qp, cmd, attrs);
-	else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) {
+	if (attr_mask & IB_QP_AV) {
+		status = ocrdma_set_av_params(qp, cmd, attrs);
+		if (status)
+			return status;
+	} else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) {
 		/* set the default mac address for UD, GSI QPs */
 		cmd->params.dmac_b0_to_b3 = qp->dev->nic_info.mac_addr[0] |
 			(qp->dev->nic_info.mac_addr[1] << 8) |
@@ -2199,8 +2184,8 @@ static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
 		cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID;
 	}
 	if (attr_mask & IB_QP_PATH_MTU) {
-		if (ib_mtu_enum_to_int(eth_mtu) <
-		    ib_mtu_enum_to_int(attrs->path_mtu)) {
+		if (attrs->path_mtu < IB_MTU_256 ||
+		    attrs->path_mtu > IB_MTU_4096) {
 			status = -EINVAL;
 			goto pmtu_err;
 		}
@@ -2283,10 +2268,12 @@ int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
 		     OCRDMA_QP_PARAMS_STATE_SHIFT) &
 		    OCRDMA_QP_PARAMS_STATE_MASK;
 		cmd->flags |= OCRDMA_QP_PARA_QPS_VALID;
-	} else
+	} else {
 		cmd->params.max_sge_recv_flags |=
 		    (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) &
 		    OCRDMA_QP_PARAMS_STATE_MASK;
+	}
+
 	status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask, old_qps);
 	if (status)
 		goto mbx_err;
@@ -2324,7 +2311,7 @@ mbx_err:
 	return status;
 }
 
-int ocrdma_mbx_create_srq(struct ocrdma_srq *srq,
+int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
 			  struct ib_srq_init_attr *srq_attr,
 			  struct ocrdma_pd *pd)
 {
@@ -2334,7 +2321,6 @@ int ocrdma_mbx_create_srq(struct ocrdma_srq *srq,
 	struct ocrdma_create_srq_rsp *rsp;
 	struct ocrdma_create_srq *cmd;
 	dma_addr_t pa;
-	struct ocrdma_dev *dev = srq->dev;
 	struct pci_dev *pdev = dev->nic_info.pdev;
 	u32 max_rqe_allocated;
 
@@ -2404,13 +2390,16 @@ int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
 {
 	int status = -ENOMEM;
 	struct ocrdma_modify_srq *cmd;
-	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd));
+	struct ocrdma_pd *pd = srq->pd;
+	struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd));
 	if (!cmd)
 		return status;
 	cmd->id = srq->id;
 	cmd->limit_max_rqe |= srq_attr->srq_limit <<
 	    OCRDMA_MODIFY_SRQ_LIMIT_SHIFT;
-	status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd);
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	kfree(cmd);
 	return status;
 }
@@ -2419,11 +2408,13 @@ int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
 {
 	int status = -ENOMEM;
 	struct ocrdma_query_srq *cmd;
-	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd));
+	struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device);
+
+	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd));
 	if (!cmd)
 		return status;
 	cmd->id = srq->rq.dbid;
-	status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd);
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	if (status == 0) {
 		struct ocrdma_query_srq_rsp *rsp =
 		    (struct ocrdma_query_srq_rsp *)cmd;
@@ -2448,7 +2439,7 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
 	if (!cmd)
 		return status;
 	cmd->id = srq->id;
-	status = ocrdma_mbx_cmd(srq->dev, (struct ocrdma_mqe *)cmd);
+	status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
 	if (srq->rq.va)
 		dma_free_coherent(&pdev->dev, srq->rq.len,
 				  srq->rq.va, srq->rq.pa);
@@ -2490,38 +2481,7 @@ int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
 	return 0;
 }
 
-static int ocrdma_create_mq_eq(struct ocrdma_dev *dev)
-{
-	int status;
-	int irq;
-	unsigned long flags = 0;
-	int num_eq = 0;
-
-	if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX)
-		flags = IRQF_SHARED;
-	else {
-		num_eq = dev->nic_info.msix.num_vectors -
-				dev->nic_info.msix.start_vector;
-		/* minimum two vectors/eq are required for rdma to work.
-		 * one for control path and one for data path.
-		 */
-		if (num_eq < 2)
-			return -EBUSY;
-	}
-
-	status = ocrdma_create_eq(dev, &dev->meq, OCRDMA_EQ_LEN);
-	if (status)
-		return status;
-	sprintf(dev->meq.irq_name, "ocrdma_mq%d", dev->id);
-	irq = ocrdma_get_irq(dev, &dev->meq);
-	status = request_irq(irq, ocrdma_irq_handler, flags, dev->meq.irq_name,
-			     &dev->meq);
-	if (status)
-		_ocrdma_destroy_eq(dev, &dev->meq);
-	return status;
-}
-
-static int ocrdma_create_qp_eqs(struct ocrdma_dev *dev)
+static int ocrdma_create_eqs(struct ocrdma_dev *dev)
 {
 	int num_eq, i, status = 0;
 	int irq;
@@ -2532,49 +2492,47 @@ static int ocrdma_create_qp_eqs(struct ocrdma_dev *dev)
 	if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) {
 		num_eq = 1;
 		flags = IRQF_SHARED;
-	} else
+	} else {
 		num_eq = min_t(u32, num_eq, num_online_cpus());
-	dev->qp_eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL);
-	if (!dev->qp_eq_tbl)
+	}
+
+	if (!num_eq)
+		return -EINVAL;
+
+	dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL);
+	if (!dev->eq_tbl)
 		return -ENOMEM;
 
 	for (i = 0; i < num_eq; i++) {
-		status = ocrdma_create_eq(dev, &dev->qp_eq_tbl[i],
+		status = ocrdma_create_eq(dev, &dev->eq_tbl[i],
 					  OCRDMA_EQ_LEN);
 		if (status) {
 			status = -EINVAL;
 			break;
 		}
-		sprintf(dev->qp_eq_tbl[i].irq_name, "ocrdma_qp%d-%d",
+		sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d",
 			dev->id, i);
-		irq = ocrdma_get_irq(dev, &dev->qp_eq_tbl[i]);
+		irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]);
 		status = request_irq(irq, ocrdma_irq_handler, flags,
-				     dev->qp_eq_tbl[i].irq_name,
-				     &dev->qp_eq_tbl[i]);
-		if (status) {
-			_ocrdma_destroy_eq(dev, &dev->qp_eq_tbl[i]);
-			status = -EINVAL;
-			break;
-		}
+				     dev->eq_tbl[i].irq_name,
+				     &dev->eq_tbl[i]);
+		if (status)
+			goto done;
 		dev->eq_cnt += 1;
 	}
 	/* one eq is sufficient for data path to work */
-	if (dev->eq_cnt >= 1)
-		return 0;
-	if (status)
-		ocrdma_destroy_qp_eqs(dev);
+	return 0;
+done:
+	ocrdma_destroy_eqs(dev);
 	return status;
 }
 
 int ocrdma_init_hw(struct ocrdma_dev *dev)
 {
 	int status;
-	/* set up control path eq */
-	status = ocrdma_create_mq_eq(dev);
-	if (status)
-		return status;
-	/* set up data path eq */
-	status = ocrdma_create_qp_eqs(dev);
+
+	/* create the eqs  */
+	status = ocrdma_create_eqs(dev);
 	if (status)
 		goto qpeq_err;
 	status = ocrdma_create_mq(dev);
@@ -2597,9 +2555,8 @@ int ocrdma_init_hw(struct ocrdma_dev *dev)
 conf_err:
 	ocrdma_destroy_mq(dev);
 mq_err:
-	ocrdma_destroy_qp_eqs(dev);
+	ocrdma_destroy_eqs(dev);
 qpeq_err:
-	ocrdma_destroy_eq(dev, &dev->meq);
 	pr_err("%s() status=%d\n", __func__, status);
 	return status;
 }
@@ -2608,10 +2565,9 @@ void ocrdma_cleanup_hw(struct ocrdma_dev *dev)
 {
 	ocrdma_mbx_delete_ah_tbl(dev);
 
-	/* cleanup the data path eqs */
-	ocrdma_destroy_qp_eqs(dev);
+	/* cleanup the eqs */
+	ocrdma_destroy_eqs(dev);
 
 	/* cleanup the control path */
 	ocrdma_destroy_mq(dev);
-	ocrdma_destroy_eq(dev, &dev->meq);
 }
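
The hunks above fold the dedicated MQ event queue and the per-QP EQ table into one shared EQ table created by ocrdma_create_eqs(). A standalone sketch of the resulting EQ-count policy (one EQ under INTx, otherwise one per usable MSI-X vector capped at the online CPU count); the function and its inputs are illustrative only, not driver code:

#include <stdio.h>

/* Illustrative restatement of the num_eq selection in ocrdma_create_eqs(). */
static unsigned int pick_num_eq(int intx_mode, unsigned int msix_vectors,
				unsigned int online_cpus)
{
	if (intx_mode)
		return 1;	/* single shared EQ when only INTx is available */
	/* one EQ per vector, but no more than one per online CPU */
	return msix_vectors < online_cpus ? msix_vectors : online_cpus;
}

int main(void)
{
	printf("INTx -> %u EQ(s); 8 vectors, 4 CPUs -> %u EQ(s)\n",
	       pick_num_eq(1, 8, 4), pick_num_eq(0, 8, 4));
	return 0;
}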

+ 9 - 4
drivers/infiniband/hw/ocrdma/ocrdma_hw.h

@@ -78,6 +78,11 @@ static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len)
 #endif
 }
 
+static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid)
+{
+	return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size);
+}
+
 int ocrdma_init_hw(struct ocrdma_dev *);
 void ocrdma_cleanup_hw(struct ocrdma_dev *);
 
@@ -86,6 +91,7 @@ void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed,
 		       bool solicited, u16 cqe_popped);
 
 /* verbs specific mailbox commands */
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed);
 int ocrdma_query_config(struct ocrdma_dev *,
 			struct ocrdma_mbx_query_config *config);
 int ocrdma_resolve_dgid(struct ocrdma_dev *, union ib_gid *dgid, u8 *mac_addr);
@@ -100,7 +106,7 @@ int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey);
 int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
 			u32 pd_id, int acc);
 int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
-				int entries, int dpp_cq);
+				int entries, int dpp_cq, u16 pd_id);
 int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
 
 int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
@@ -112,8 +118,7 @@ int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *,
 int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *,
 			struct ocrdma_qp_params *param);
 int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *);
-
-int ocrdma_mbx_create_srq(struct ocrdma_srq *,
+int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *,
 			  struct ib_srq_init_attr *,
 			  struct ocrdma_pd *);
 int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *);
@@ -123,7 +128,7 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *);
 int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *);
 int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *);
 
-int ocrdma_qp_state_machine(struct ocrdma_qp *, enum ib_qp_state new_state,
+int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,
 			    enum ib_qp_state *old_ib_state);
 bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
 bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
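
The new ocrdma_get_db_addr() helper above derives a protection domain's doorbell page from the unmapped doorbell base plus a fixed per-PD stride. The same arithmetic restated as a tiny standalone program, with made-up base and stride values:

#include <stdint.h>
#include <stdio.h>

/* Same arithmetic as ocrdma_get_db_addr(): base + pdid * per-PD page size. */
static uint64_t example_db_addr(uint64_t unmapped_db, uint32_t db_page_size,
				uint32_t pdid)
{
	return unmapped_db + (uint64_t)pdid * db_page_size;
}

int main(void)
{
	/* hypothetical base and page size, just to show the layout */
	printf("pd 3 doorbell at 0x%llx\n",
	       (unsigned long long)example_db_addr(0xd0000000ULL, 0x1000, 3));
	return 0;
}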

+ 7 - 0
drivers/infiniband/hw/ocrdma/ocrdma_main.c

@@ -39,6 +39,7 @@
 #include "ocrdma_ah.h"
 #include "be_roce.h"
 #include "ocrdma_hw.h"
+#include "ocrdma_abi.h"
 
 MODULE_VERSION(OCRDMA_ROCE_DEV_VERSION);
 MODULE_DESCRIPTION("Emulex RoCE HCA Driver");
@@ -265,6 +266,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
 	       sizeof(OCRDMA_NODE_DESC));
 	dev->ibdev.owner = THIS_MODULE;
+	dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
 	dev->ibdev.uverbs_cmd_mask =
 	    OCRDMA_UVERBS(GET_CONTEXT) |
 	    OCRDMA_UVERBS(QUERY_DEVICE) |
@@ -326,9 +328,14 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	dev->ibdev.req_notify_cq = ocrdma_arm_cq;
 
 	dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
+	dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr;
 	dev->ibdev.dereg_mr = ocrdma_dereg_mr;
 	dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
 
+	dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
+	dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
+	dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
+
 	/* mandatory to support user space verbs consumer. */
 	dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext;
 	dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext;

+ 136 - 74
drivers/infiniband/hw/ocrdma/ocrdma_sli.h

@@ -70,6 +70,7 @@ enum {
 
 #define OCRDMA_SUBSYS_COMMON 1
 enum {
+	OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5,
 	OCRDMA_CMD_CREATE_CQ		= 12,
 	OCRDMA_CMD_CREATE_EQ		= 13,
 	OCRDMA_CMD_CREATE_MQ		= 21,
@@ -91,15 +92,15 @@ enum {
 
 #define OCRDMA_MAX_QP    2048
 #define OCRDMA_MAX_CQ    2048
+#define OCRDMA_MAX_STAG  8192
 
 enum {
 	OCRDMA_DB_RQ_OFFSET		= 0xE0,
-	OCRDMA_DB_GEN2_RQ1_OFFSET	= 0x100,
-	OCRDMA_DB_GEN2_RQ2_OFFSET	= 0xC0,
+	OCRDMA_DB_GEN2_RQ_OFFSET        = 0x100,
 	OCRDMA_DB_SQ_OFFSET		= 0x60,
 	OCRDMA_DB_GEN2_SQ_OFFSET	= 0x1C0,
 	OCRDMA_DB_SRQ_OFFSET		= OCRDMA_DB_RQ_OFFSET,
-	OCRDMA_DB_GEN2_SRQ_OFFSET	= OCRDMA_DB_GEN2_RQ1_OFFSET,
+	OCRDMA_DB_GEN2_SRQ_OFFSET	= OCRDMA_DB_GEN2_RQ_OFFSET,
 	OCRDMA_DB_CQ_OFFSET		= 0x120,
 	OCRDMA_DB_EQ_OFFSET		= OCRDMA_DB_CQ_OFFSET,
 	OCRDMA_DB_MQ_OFFSET		= 0x140
@@ -143,8 +144,11 @@ enum {
 # 2: 16K Bytes
 # 3: 32K Bytes
 # 4: 64K Bytes
+# 5: 128K Bytes
+# 6: 256K Bytes
+# 7: 512K Bytes
 */
-#define OCRDMA_MAX_Q_PAGE_SIZE_CNT (5)
+#define OCRDMA_MAX_Q_PAGE_SIZE_CNT (8)
 #define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES)
 
 #define MAX_OCRDMA_QP_PAGES      (8)
@@ -177,7 +181,7 @@ struct ocrdma_mbx_hdr {
 	u32 timeout;		/* in seconds */
 	u32 cmd_len;
 	u32 rsvd_version;
-} __packed;
+};
 
 enum {
 	OCRDMA_MBX_RSP_OPCODE_SHIFT	= 0,
@@ -197,7 +201,7 @@ struct ocrdma_mbx_rsp {
 	u32 status;
 	u32 rsp_len;
 	u32 add_rsp_len;
-} __packed;
+};
 
 enum {
 	OCRDMA_MQE_EMBEDDED	= 1,
@@ -208,7 +212,7 @@ struct ocrdma_mqe_sge {
 	u32 pa_lo;
 	u32 pa_hi;
 	u32 len;
-} __packed;
+};
 
 enum {
 	OCRDMA_MQE_HDR_EMB_SHIFT	= 0,
@@ -225,12 +229,12 @@ struct ocrdma_mqe_hdr {
 	u32 tag_lo;
 	u32 tag_hi;
 	u32 rsvd3;
-} __packed;
+};
 
 struct ocrdma_mqe_emb_cmd {
 	struct ocrdma_mbx_hdr mch;
 	u8 pyld[220];
-} __packed;
+};
 
 struct ocrdma_mqe {
 	struct ocrdma_mqe_hdr hdr;
@@ -242,7 +246,7 @@ struct ocrdma_mqe {
 		u8 cmd[236];
 		struct ocrdma_mbx_rsp rsp;
 	} u;
-} __packed;
+};
 
 #define OCRDMA_EQ_LEN       4096
 #define OCRDMA_MQ_CQ_LEN    256
@@ -259,12 +263,12 @@ struct ocrdma_mqe {
 struct ocrdma_delete_q_req {
 	struct ocrdma_mbx_hdr req;
 	u32 id;
-} __packed;
+};
 
 struct ocrdma_pa {
 	u32 lo;
 	u32 hi;
-} __packed;
+};
 
 #define MAX_OCRDMA_EQ_PAGES (8)
 struct ocrdma_create_eq_req {
@@ -275,7 +279,7 @@ struct ocrdma_create_eq_req {
 	u32 delay;
 	u32 rsvd;
 	struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES];
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_EQ_VALID	= Bit(29),
@@ -310,7 +314,7 @@ struct ocrdma_mcqe {
 	u32 tag_lo;
 	u32 tag_hi;
 	u32 valid_ae_cmpl_cons;
-} __packed;
+};
 
 enum {
 	OCRDMA_AE_MCQE_QPVALID		= Bit(31),
@@ -332,7 +336,21 @@ struct ocrdma_ae_mcqe {
 	u32 cqvalid_cqid;
 	u32 evt_tag;
 	u32 valid_ae_event;
-} __packed;
+};
+
+enum {
+	OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0,
+	OCRDMA_AE_PVID_MCQE_ENABLED_MASK  = 0xFF,
+	OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16,
+	OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT
+};
+
+struct ocrdma_ae_pvid_mcqe {
+	u32 tag_enabled;
+	u32 event_tag;
+	u32 rsvd1;
+	u32 rsvd2;
+};
 
 enum {
 	OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT		= 16,
@@ -356,7 +374,7 @@ struct ocrdma_ae_mpa_mcqe {
 	u32 w1;
 	u32 w2;
 	u32 valid_ae_event;
-} __packed;
+};
 
 enum {
 	OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT	= 0,
@@ -382,9 +400,11 @@ struct ocrdma_ae_qp_mcqe {
 	u32 w1;
 	u32 w2;
 	u32 valid_ae_event;
-} __packed;
+};
 
-#define OCRDMA_ASYNC_EVE_CODE 0x14
+#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14
+#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5
+#define OCRDMA_ASYNC_EVENT_PVID_STATE 0x3
 
 enum OCRDMA_ASYNC_EVENT_TYPE {
 	OCRDMA_CQ_ERROR			= 0x00,
@@ -487,7 +507,8 @@ struct ocrdma_mbx_query_config {
 	u32 max_ird_ord_per_qp;
 	u32 max_shared_ird_ord;
 	u32 max_mr;
-	u64 max_mr_size;
+	u32 max_mr_size_lo;
+	u32 max_mr_size_hi;
 	u32 max_num_mr_pbl;
 	u32 max_mw;
 	u32 max_fmr;
@@ -502,14 +523,14 @@ struct ocrdma_mbx_query_config {
 	u32 max_wqes_rqes_per_q;
 	u32 max_cq_cqes_per_cq;
 	u32 max_srq_rqe_sge;
-} __packed;
+};
 
 struct ocrdma_fw_ver_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
 
 	u8 running_ver[32];
-} __packed;
+};
 
 struct ocrdma_fw_conf_rsp {
 	struct ocrdma_mqe_hdr hdr;
@@ -535,14 +556,41 @@ struct ocrdma_fw_conf_rsp {
 	u32 base_eqid;
 	u32 max_eq;
 
-} __packed;
+};
 
 enum {
 	OCRDMA_FN_MODE_RDMA	= 0x4
 };
 
+struct ocrdma_get_link_speed_rsp {
+	struct ocrdma_mqe_hdr hdr;
+	struct ocrdma_mbx_rsp rsp;
+
+	u8 pt_port_num;
+	u8 link_duplex;
+	u8 phys_port_speed;
+	u8 phys_port_fault;
+	u16 rsvd1;
+	u16 qos_lnk_speed;
+	u8 logical_lnk_status;
+	u8 rsvd2[3];
+};
+
+enum {
+	OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0,
+	OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1,
+	OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2,
+	OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3,
+	OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4,
+	OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5,
+	OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6,
+	OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7,
+	OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8
+};
+
 enum {
 	OCRDMA_CREATE_CQ_VER2			= 2,
+	OCRDMA_CREATE_CQ_VER3                   = 3,
 
 	OCRDMA_CREATE_CQ_PAGE_CNT_MASK		= 0xFFFF,
 	OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT	= 16,
@@ -576,7 +624,8 @@ struct ocrdma_create_cq_cmd {
 	u32 pgsz_pgcnt;
 	u32 ev_cnt_flags;
 	u32 eqn;
-	u32 cqe_count;
+	u16 cqe_count;
+	u16 pd_id;
 	u32 rsvd6;
 	struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES];
 };
@@ -584,7 +633,7 @@ struct ocrdma_create_cq_cmd {
 struct ocrdma_create_cq {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_create_cq_cmd cmd;
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK	= 0xFFFF
@@ -593,12 +642,12 @@ enum {
 struct ocrdma_create_cq_cmd_rsp {
 	struct ocrdma_mbx_rsp rsp;
 	u32 cq_id;
-} __packed;
+};
 
 struct ocrdma_create_cq_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_create_cq_cmd_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT		= 22,
@@ -617,12 +666,12 @@ struct ocrdma_create_mq_req {
 	u32 async_cqid_valid;
 	u32 rsvd;
 	struct ocrdma_pa pa[8];
-} __packed;
+};
 
 struct ocrdma_create_mq_rsp {
 	struct ocrdma_mbx_rsp rsp;
 	u32 id;
-} __packed;
+};
 
 enum {
 	OCRDMA_DESTROY_CQ_QID_SHIFT			= 0,
@@ -637,12 +686,12 @@ struct ocrdma_destroy_cq {
 	struct ocrdma_mbx_hdr req;
 
 	u32 bypass_flush_qid;
-} __packed;
+};
 
 struct ocrdma_destroy_cq_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_QPT_GSI	= 1,
@@ -766,7 +815,7 @@ struct ocrdma_create_qp_req {
 	u32 dpp_credits_cqid;
 	u32 rpir_lkey;
 	struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES];
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT		= 0,
@@ -820,18 +869,18 @@ struct ocrdma_create_qp_rsp {
 	u32 max_ord_ird;
 	u32 sq_rq_id;
 	u32 dpp_response;
-} __packed;
+};
 
 struct ocrdma_destroy_qp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_hdr req;
 	u32 qp_id;
-} __packed;
+};
 
 struct ocrdma_destroy_qp_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_MODIFY_QP_ID_SHIFT	= 0,
@@ -975,7 +1024,7 @@ struct ocrdma_qp_params {
 	u32 dmac_b0_to_b3;
 	u32 vlan_dmac_b4_to_b5;
 	u32 qkey;
-} __packed;
+};
 
 
 struct ocrdma_modify_qp {
@@ -986,7 +1035,7 @@ struct ocrdma_modify_qp {
 	u32 flags;
 	u32 rdma_flags;
 	u32 num_outstanding_atomic_rd;
-} __packed;
+};
 
 enum {
 	OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT	= 0,
@@ -1007,7 +1056,7 @@ struct ocrdma_modify_qp_rsp {
 
 	u32 max_wqe_rqe;
 	u32 max_ord_ird;
-} __packed;
+};
 
 struct ocrdma_query_qp {
 	struct ocrdma_mqe_hdr hdr;
@@ -1016,13 +1065,13 @@ struct ocrdma_query_qp {
 #define OCRDMA_QUERY_UP_QP_ID_SHIFT 0
 #define OCRDMA_QUERY_UP_QP_ID_MASK   0xFFFFFF
 	u32 qp_id;
-} __packed;
+};
 
 struct ocrdma_query_qp_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
 	struct ocrdma_qp_params params;
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_SRQ_PD_ID_SHIFT		= 0,
@@ -1051,7 +1100,7 @@ struct ocrdma_create_srq {
 	u32 max_sge_rqe;
 	u32 pages_rqe_sz;
 	struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES];
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT			= 0,
@@ -1070,7 +1119,7 @@ struct ocrdma_create_srq_rsp {
 
 	u32 id;
 	u32 max_sge_rqe_allocated;
-} __packed;
+};
 
 enum {
 	OCRDMA_MODIFY_SRQ_ID_SHIFT	= 0,
@@ -1089,7 +1138,7 @@ struct ocrdma_modify_srq {
 
 	u32 id;
 	u32 limit_max_rqe;
-} __packed;
+};
 
 enum {
 	OCRDMA_QUERY_SRQ_ID_SHIFT	= 0,
@@ -1101,7 +1150,7 @@ struct ocrdma_query_srq {
 	struct ocrdma_mbx_rsp req;
 
 	u32 id;
-} __packed;
+};
 
 enum {
 	OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT	= 0,
@@ -1123,7 +1172,7 @@ struct ocrdma_query_srq_rsp {
 
 	u32 max_rqe_pdid;
 	u32 srq_lmt_max_sge;
-} __packed;
+};
 
 enum {
 	OCRDMA_DESTROY_SRQ_ID_SHIFT	= 0,
@@ -1135,7 +1184,7 @@ struct ocrdma_destroy_srq {
 	struct ocrdma_mbx_rsp req;
 
 	u32 id;
-} __packed;
+};
 
 enum {
 	OCRDMA_ALLOC_PD_ENABLE_DPP	= BIT(16),
@@ -1147,7 +1196,7 @@ struct ocrdma_alloc_pd {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_hdr req;
 	u32 enable_dpp_rsvd;
-} __packed;
+};
 
 enum {
 	OCRDMA_ALLOC_PD_RSP_DPP			= Bit(16),
@@ -1159,18 +1208,18 @@ struct ocrdma_alloc_pd_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
 	u32 dpp_page_pdid;
-} __packed;
+};
 
 struct ocrdma_dealloc_pd {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_hdr req;
 	u32 id;
-} __packed;
+};
 
 struct ocrdma_dealloc_pd_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_ADDR_CHECK_ENABLE	= 1,
@@ -1206,7 +1255,7 @@ struct ocrdma_alloc_lkey {
 
 	u32 pdid;
 	u32 pbl_sz_flags;
-} __packed;
+};
 
 struct ocrdma_alloc_lkey_rsp {
 	struct ocrdma_mqe_hdr hdr;
@@ -1214,7 +1263,7 @@ struct ocrdma_alloc_lkey_rsp {
 
 	u32 lrkey;
 	u32 num_pbl_rsvd;
-} __packed;
+};
 
 struct ocrdma_dealloc_lkey {
 	struct ocrdma_mqe_hdr hdr;
@@ -1222,12 +1271,12 @@ struct ocrdma_dealloc_lkey {
 
 	u32 lkey;
 	u32 rsvd_frmr;
-} __packed;
+};
 
 struct ocrdma_dealloc_lkey_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 #define MAX_OCRDMA_NSMR_PBL    (u32)22
 #define MAX_OCRDMA_PBL_SIZE     65536
@@ -1273,7 +1322,7 @@ struct ocrdma_reg_nsmr {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_hdr cmd;
 
-	u32 lrkey_key_index;
+	u32 fr_mr;
 	u32 num_pbl_pdid;
 	u32 flags_hpage_pbe_sz;
 	u32 totlen_low;
@@ -1283,7 +1332,7 @@ struct ocrdma_reg_nsmr {
 	u32 va_loaddr;
 	u32 va_hiaddr;
 	struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
-} __packed;
+};
 
 enum {
 	OCRDMA_REG_NSMR_CONT_PBL_SHIFT		= 0,
@@ -1305,12 +1354,12 @@ struct ocrdma_reg_nsmr_cont {
 	u32 last;
 
 	struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
-} __packed;
+};
 
 struct ocrdma_pbe {
 	u32 pa_hi;
 	u32 pa_lo;
-} __packed;
+};
 
 enum {
 	OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT	= 16,
@@ -1322,7 +1371,7 @@ struct ocrdma_reg_nsmr_rsp {
 
 	u32 lrkey;
 	u32 num_pbl;
-} __packed;
+};
 
 enum {
 	OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT	= 0,
@@ -1342,7 +1391,7 @@ struct ocrdma_reg_nsmr_cont_rsp {
 
 	u32 lrkey_key_index;
 	u32 num_pbl;
-} __packed;
+};
 
 enum {
 	OCRDMA_ALLOC_MW_PD_ID_SHIFT	= 0,
@@ -1354,7 +1403,7 @@ struct ocrdma_alloc_mw {
 	struct ocrdma_mbx_hdr req;
 
 	u32 pdid;
-} __packed;
+};
 
 enum {
 	OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT	= 0,
@@ -1366,7 +1415,7 @@ struct ocrdma_alloc_mw_rsp {
 	struct ocrdma_mbx_rsp rsp;
 
 	u32 lrkey_index;
-} __packed;
+};
 
 struct ocrdma_attach_mcast {
 	struct ocrdma_mqe_hdr hdr;
@@ -1375,12 +1424,12 @@ struct ocrdma_attach_mcast {
 	u8 mgid[16];
 	u32 mac_b0_to_b3;
 	u32 vlan_mac_b4_to_b5;
-} __packed;
+};
 
 struct ocrdma_attach_mcast_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 struct ocrdma_detach_mcast {
 	struct ocrdma_mqe_hdr hdr;
@@ -1389,12 +1438,12 @@ struct ocrdma_detach_mcast {
 	u8 mgid[16];
 	u32 mac_b0_to_b3;
 	u32 vlan_mac_b4_to_b5;
-} __packed;
+};
 
 struct ocrdma_detach_mcast_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_CREATE_AH_NUM_PAGES_SHIFT	= 19,
@@ -1418,24 +1467,24 @@ struct ocrdma_create_ah_tbl {
 
 	u32 ah_conf;
 	struct ocrdma_pa tbl_addr[8];
-} __packed;
+};
 
 struct ocrdma_create_ah_tbl_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
 	u32 ahid;
-} __packed;
+};
 
 struct ocrdma_delete_ah_tbl {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_hdr req;
 	u32 ahid;
-} __packed;
+};
 
 struct ocrdma_delete_ah_tbl_rsp {
 	struct ocrdma_mqe_hdr hdr;
 	struct ocrdma_mbx_rsp rsp;
-} __packed;
+};
 
 enum {
 	OCRDMA_EQE_VALID_SHIFT		= 0,
@@ -1448,7 +1497,7 @@ enum {
 
 struct ocrdma_eqe {
 	u32 id_valid;
-} __packed;
+};
 
 enum OCRDMA_CQE_STATUS {
 	OCRDMA_CQE_SUCCESS = 0,
@@ -1532,14 +1581,14 @@ struct ocrdma_cqe {
 		} cmn;
 	};
 	u32 flags_status_srcqpn;	/* w3 */
-} __packed;
+};
 
 struct ocrdma_sge {
 	u32 addr_hi;
 	u32 addr_lo;
 	u32 lrkey;
 	u32 len;
-} __packed;
+};
 
 enum {
 	OCRDMA_FLAG_SIG		= 0x1,
@@ -1563,6 +1612,7 @@ enum OCRDMA_WQE_OPCODE {
 	OCRDMA_SEND		= 0x00,
 	OCRDMA_CMP_SWP		= 0x14,
 	OCRDMA_BIND_MW		= 0x10,
+	OCRDMA_FR_MR            = 0x11,
 	OCRDMA_RESV1		= 0x0A,
 	OCRDMA_LKEY_INV		= 0x15,
 	OCRDMA_FETCH_ADD	= 0x13,
@@ -1600,14 +1650,26 @@ struct ocrdma_hdr_wqe {
 		u32 lkey;
 	};
 	u32 total_len;
-} __packed;
+};
 
 struct ocrdma_ewqe_ud_hdr {
 	u32 rsvd_dest_qpn;
 	u32 qkey;
 	u32 rsvd_ahid;
 	u32 rsvd;
-} __packed;
+};
+
+/* extended wqe followed by hdr_wqe for Fast Memory register */
+struct ocrdma_ewqe_fr {
+	u32 va_hi;
+	u32 va_lo;
+	u32 fbo_hi;
+	u32 fbo_lo;
+	u32 size_sge;
+	u32 num_sges;
+	u32 rsvd;
+	u32 rsvd2;
+};
 
 struct ocrdma_eth_basic {
 	u8 dmac[6];
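
Among the ocrdma_sli.h changes above, the query-config response now reports the maximum MR size as two 32-bit words (max_mr_size_lo/hi) rather than a u64, which likely keeps the structure layout matching the firmware once __packed is dropped. A hedged sketch of how a consumer would recombine the halves; the helper name is made up, and the real recombination lives in the suppressed ocrdma_verbs.c/ocrdma_hw.c diffs:

#include <stdint.h>

/* Hypothetical helper: rebuild the 64-bit max MR size from the two words
 * reported in struct ocrdma_mbx_query_config. */
static inline uint64_t example_max_mr_size(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}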

File diff suppressed because it is too large
+ 353 - 147
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c


+ 6 - 0
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h

@@ -72,6 +72,7 @@ int ocrdma_query_qp(struct ib_qp *,
 		    struct ib_qp_attr *qp_attr,
 		    int qp_attr_mask, struct ib_qp_init_attr *);
 int ocrdma_destroy_qp(struct ib_qp *);
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp);
 
 struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
 				 struct ib_udata *);
@@ -89,5 +90,10 @@ struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
 				   int num_phys_buf, int acc, u64 *iova_start);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
 				 u64 virt, int acc, struct ib_udata *);
+struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
+struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
+							*ibdev,
+							int page_list_len);
+void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list);
 
 #endif				/* __OCRDMA_VERBS_H__ */
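
With ocrdma_alloc_frmr() and the page-list helpers exported above, ocrdma can now serve the core fast-registration verbs. A trimmed, illustrative sketch of the consumer flow a kernel ULP of this era would use through the ib_verbs wrappers, following the same WR fields the iSER FRWR code later in this merge fills in; "pd", "qp", "nr_pages" and the omitted error handling are placeholders, not part of this driver:

struct ib_mr *mr = ib_alloc_fast_reg_mr(pd, nr_pages);
struct ib_fast_reg_page_list *frpl =
	ib_alloc_fast_reg_page_list(pd->device, nr_pages);
struct ib_send_wr fr_wr, *bad_wr;

/* caller fills frpl->page_list[0..nr_pages-1] with 4K-aligned DMA pages */
memset(&fr_wr, 0, sizeof(fr_wr));
fr_wr.opcode			= IB_WR_FAST_REG_MR;
fr_wr.send_flags		= IB_SEND_SIGNALED;
fr_wr.wr.fast_reg.iova_start	= frpl->page_list[0];
fr_wr.wr.fast_reg.page_list	= frpl;
fr_wr.wr.fast_reg.page_list_len	= nr_pages;
fr_wr.wr.fast_reg.page_shift	= PAGE_SHIFT;
fr_wr.wr.fast_reg.length	= nr_pages * PAGE_SIZE;
fr_wr.wr.fast_reg.rkey		= mr->rkey;
fr_wr.wr.fast_reg.access_flags	= IB_ACCESS_LOCAL_WRITE |
				  IB_ACCESS_REMOTE_READ;
ib_post_send(qp, &fr_wr, &bad_wr);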

+ 4 - 1
drivers/infiniband/hw/qib/qib.h

@@ -89,7 +89,6 @@ struct qlogic_ib_stats {
 
 extern struct qlogic_ib_stats qib_stats;
 extern const struct pci_error_handlers qib_pci_err_handler;
-extern struct pci_driver qib_driver;
 
 #define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ
 /*
@@ -576,11 +575,13 @@ struct qib_pportdata {
 	/* read/write using lock */
 	spinlock_t            sdma_lock ____cacheline_aligned_in_smp;
 	struct list_head      sdma_activelist;
+	struct list_head      sdma_userpending;
 	u64                   sdma_descq_added;
 	u64                   sdma_descq_removed;
 	u16                   sdma_descq_tail;
 	u16                   sdma_descq_head;
 	u8                    sdma_generation;
+	u8                    sdma_intrequest;
 
 	struct tasklet_struct sdma_sw_clean_up_task
 		____cacheline_aligned_in_smp;
@@ -1326,6 +1327,8 @@ int qib_setup_sdma(struct qib_pportdata *);
 void qib_teardown_sdma(struct qib_pportdata *);
 void __qib_sdma_intr(struct qib_pportdata *);
 void qib_sdma_intr(struct qib_pportdata *);
+void qib_user_sdma_send_desc(struct qib_pportdata *dd,
+			struct list_head *pktlist);
 int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *,
 			u32, struct qib_verbs_txreq *);
 /* ppd->sdma_lock should be locked before calling this. */

+ 31 - 1
drivers/infiniband/hw/qib/qib_common.h

@@ -279,7 +279,7 @@ struct qib_base_info {
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define QIB_USER_SWMINOR 12
+#define QIB_USER_SWMINOR 13
 
 #define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR)
 
@@ -701,7 +701,37 @@ struct qib_message_header {
 	__be32 bth[3];
 	/* fields below this point are in host byte order */
 	struct qib_header iph;
+	/* fields below are simplified, but should match PSM */
+	/* some are accessed by driver when packet splitting is needed */
 	__u8 sub_opcode;
+	__u8 flags;
+	__u16 commidx;
+	__u32 ack_seq_num;
+	__u8 flowid;
+	__u8 hdr_dlen;
+	__u16 mqhdr;
+	__u32 uwords[4];
+};
+
+/* sequence number bits for message */
+union qib_seqnum {
+	struct {
+		__u32 seq:11;
+		__u32 gen:8;
+		__u32 flow:5;
+	};
+	struct {
+		__u32 pkt:16;
+		__u32 msg:8;
+	};
+	__u32 val;
+};
+
+/* qib receiving-dma tid-session-member */
+struct qib_tid_session_member {
+	__u16 tid;
+	__u16 offset;
+	__u16 length;
 };
 
 /* IB - LRH header consts */
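
The new union qib_seqnum above packs a per-flow sequence number, generation and flow id into one 32-bit word shared between the driver and PSM. A small user-space illustration of the same packing, using standard types instead of __u32; bit-field layout is compiler/ABI dependent, so treat this as a little-endian example only:

#include <stdio.h>
#include <stdint.h>

union seqnum {
	struct {
		uint32_t seq:11;	/* packet sequence within the flow */
		uint32_t gen:8;		/* generation */
		uint32_t flow:5;	/* flow id */
	};
	uint32_t val;			/* raw 32-bit view */
};

int main(void)
{
	union seqnum s = { .val = 0 };

	s.seq  = 42;
	s.gen  = 3;
	s.flow = 1;
	printf("packed value: 0x%08x\n", (unsigned)s.val);
	return 0;
}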

+ 1 - 1
drivers/infiniband/hw/qib/qib_file_ops.c

@@ -1220,7 +1220,7 @@ static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
 			return user_swminor == 3;
 		default:
 			/* >= 4 are compatible (or are expected to be) */
-			return user_swminor >= 4;
+			return user_swminor <= QIB_USER_SWMINOR;
 		}
 	}
 	/* make no promises yet for future major versions */

+ 1 - 1
drivers/infiniband/hw/qib/qib_init.c

@@ -1193,7 +1193,7 @@ static DEFINE_PCI_DEVICE_TABLE(qib_pci_tbl) = {
 
 MODULE_DEVICE_TABLE(pci, qib_pci_tbl);
 
-struct pci_driver qib_driver = {
+static struct pci_driver qib_driver = {
 	.name = QIB_DRV_NAME,
 	.probe = qib_init_one,
 	.remove = qib_remove_one,

+ 2 - 1
drivers/infiniband/hw/qib/qib_mad.h

@@ -415,7 +415,6 @@ struct cc_table_shadow {
 	struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
 } __packed;
 
-#endif				/* _QIB_MAD_H */
 /*
  * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
  * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
@@ -428,3 +427,5 @@ struct cc_table_shadow {
 		    COUNTER_MASK(1, 2) | \
 		    COUNTER_MASK(1, 3) | \
 		    COUNTER_MASK(1, 4))
+
+#endif				/* _QIB_MAD_H */

+ 5 - 5
drivers/infiniband/hw/qib/qib_pcie.c

@@ -283,12 +283,12 @@ int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent,
 		goto bail;
 	}
 
-	pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSIX);
+	pos = dd->pcidev->msix_cap;
 	if (nent && *nent && pos) {
 		qib_msix_setup(dd, pos, nent, entry);
 		ret = 0; /* did it, either MSIx or INTx */
 	} else {
-		pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI);
+		pos = dd->pcidev->msi_cap;
 		if (pos)
 			ret = qib_msi_setup(dd, pos);
 		else
@@ -357,7 +357,7 @@ int qib_reinit_intr(struct qib_devdata *dd)
 	if (!dd->msi_lo)
 		goto bail;
 
-	pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI);
+	pos = dd->pcidev->msi_cap;
 	if (!pos) {
 		qib_dev_err(dd,
 			"Can't find MSI capability, can't restore MSI settings\n");
@@ -426,7 +426,7 @@ void qib_enable_intx(struct pci_dev *pdev)
 	if (new != cw)
 		pci_write_config_word(pdev, PCI_COMMAND, new);
 
-	pos = pci_find_capability(pdev, PCI_CAP_ID_MSI);
+	pos = pdev->msi_cap;
 	if (pos) {
 		/* then turn off MSI */
 		pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw);
@@ -434,7 +434,7 @@ void qib_enable_intx(struct pci_dev *pdev)
 		if (new != cw)
 			pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new);
 	}
-	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
+	pos = pdev->msix_cap;
 	if (pos) {
 		/* then turn off MSIx */
 		pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw);

+ 7 - 1
drivers/infiniband/hw/qib/qib_sdma.c

@@ -423,8 +423,11 @@ void qib_sdma_intr(struct qib_pportdata *ppd)
 
 void __qib_sdma_intr(struct qib_pportdata *ppd)
 {
-	if (__qib_sdma_running(ppd))
+	if (__qib_sdma_running(ppd)) {
 		qib_sdma_make_progress(ppd);
+		if (!list_empty(&ppd->sdma_userpending))
+			qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+	}
 }
 
 int qib_setup_sdma(struct qib_pportdata *ppd)
@@ -452,6 +455,9 @@ int qib_setup_sdma(struct qib_pportdata *ppd)
 	ppd->sdma_descq_removed = 0;
 	ppd->sdma_descq_added = 0;
 
+	ppd->sdma_intrequest = 0;
+	INIT_LIST_HEAD(&ppd->sdma_userpending);
+
 	INIT_LIST_HEAD(&ppd->sdma_activelist);
 
 	tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task,

File diff suppressed because it is too large
+ 575 - 142
drivers/infiniband/hw/qib/qib_user_sdma.c


+ 0 - 3
drivers/infiniband/ulp/ipoib/ipoib_cm.c

@@ -817,7 +817,6 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 		if (neigh) {
 			neigh->cm = NULL;
-			list_del(&neigh->list);
 			ipoib_neigh_free(neigh);
 
 			tx->neigh = NULL;
@@ -1234,7 +1233,6 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 
 		if (neigh) {
 			neigh->cm = NULL;
-			list_del(&neigh->list);
 			ipoib_neigh_free(neigh);
 
 			tx->neigh = NULL;
@@ -1325,7 +1323,6 @@ static void ipoib_cm_tx_start(struct work_struct *work)
 			neigh = p->neigh;
 			if (neigh) {
 				neigh->cm = NULL;
-				list_del(&neigh->list);
 				ipoib_neigh_free(neigh);
 			}
 			list_del(&p->list);

+ 3 - 6
drivers/infiniband/ulp/ipoib/ipoib_main.c

@@ -493,7 +493,6 @@ static void path_rec_completion(int status,
 									       path,
 									       neigh));
 				if (!ipoib_cm_get(neigh)) {
-					list_del(&neigh->list);
 					ipoib_neigh_free(neigh);
 					continue;
 				}
@@ -618,7 +617,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 			if (!ipoib_cm_get(neigh))
 				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
 			if (!ipoib_cm_get(neigh)) {
-				list_del(&neigh->list);
 				ipoib_neigh_free(neigh);
 				goto err_drop;
 			}
@@ -639,7 +637,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 		neigh->ah  = NULL;
 
 		if (!path->query && path_rec_start(dev, path))
-			goto err_list;
+			goto err_path;
 
 		__skb_queue_tail(&neigh->queue, skb);
 	}
@@ -648,9 +646,6 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 	ipoib_neigh_put(neigh);
 	return;
 
-err_list:
-	list_del(&neigh->list);
-
 err_path:
 	ipoib_neigh_free(neigh);
 err_drop:
@@ -1098,6 +1093,8 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh)
 			rcu_assign_pointer(*np,
 					   rcu_dereference_protected(neigh->hnext,
 								     lockdep_is_held(&priv->lock)));
+			/* remove from parent list */
+			list_del(&neigh->list);
 			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
 			return;
 		} else {

+ 12 - 7
drivers/infiniband/ulp/iser/iscsi_iser.c

@@ -347,6 +347,7 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 {
 	struct iscsi_conn *conn = cls_conn->dd_data;
 	struct iscsi_iser_conn *iser_conn;
+	struct iscsi_session *session;
 	struct iser_conn *ib_conn;
 	struct iscsi_endpoint *ep;
 	int error;
@@ -365,7 +366,8 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
 	}
 	ib_conn = ep->dd_data;
 
-	if (iser_alloc_rx_descriptors(ib_conn))
+	session = conn->session;
+	if (iser_alloc_rx_descriptors(ib_conn, session))
 		return -ENOMEM;
 
 	/* binds the iSER connection retrieved from the previously
@@ -419,12 +421,13 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 	struct iscsi_cls_session *cls_session;
 	struct iscsi_session *session;
 	struct Scsi_Host *shost;
-	struct iser_conn *ib_conn;
+	struct iser_conn *ib_conn = NULL;
 
 	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
 	if (!shost)
 		return NULL;
 	shost->transportt = iscsi_iser_scsi_transport;
+	shost->cmd_per_lun = qdepth;
 	shost->max_lun = iscsi_max_lun;
 	shost->max_id = 0;
 	shost->max_channel = 0;
@@ -441,12 +444,14 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
 			   ep ? ib_conn->device->ib_device->dma_device : NULL))
 		goto free_host;
 
-	/*
-	 * we do not support setting can_queue cmd_per_lun from userspace yet
-	 * because we preallocate so many resources
-	 */
+	if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) {
+		iser_info("cmds_max changed from %u to %u\n",
+			  cmds_max, ISER_DEF_XMIT_CMDS_MAX);
+		cmds_max = ISER_DEF_XMIT_CMDS_MAX;
+	}
+
 	cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
-					  ISCSI_DEF_XMIT_CMDS_MAX, 0,
+					  cmds_max, 0,
 					  sizeof(struct iscsi_iser_task),
 					  initial_cmdsn, 0);
 	if (!cls_session)

+ 59 - 14
drivers/infiniband/ulp/iser/iscsi_iser.h

@@ -78,14 +78,14 @@
 
 #define iser_warn(fmt, arg...)				\
 	do {						\
-		if (iser_debug_level > 1)		\
+		if (iser_debug_level > 0)		\
 			pr_warn(PFX "%s:" fmt,          \
 				__func__ , ## arg);	\
 	} while (0)
 
 #define iser_info(fmt, arg...)				\
 	do {						\
-		if (iser_debug_level > 0)		\
+		if (iser_debug_level > 1)		\
 			pr_info(PFX "%s:" fmt,          \
 				__func__ , ## arg);	\
 	} while (0)
@@ -102,7 +102,13 @@
 
 					/* support up to 512KB in one RDMA */
 #define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
-#define ISER_DEF_CMD_PER_LUN		ISCSI_DEF_XMIT_CMDS_MAX
+#define ISER_DEF_XMIT_CMDS_DEFAULT		512
+#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
+	#define ISER_DEF_XMIT_CMDS_MAX		ISCSI_DEF_XMIT_CMDS_MAX
+#else
+	#define ISER_DEF_XMIT_CMDS_MAX		ISER_DEF_XMIT_CMDS_DEFAULT
+#endif
+#define ISER_DEF_CMD_PER_LUN		ISER_DEF_XMIT_CMDS_MAX
 
 /* QP settings */
 /* Maximal bounds on received asynchronous PDUs */
@@ -111,9 +117,9 @@
 #define ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1),         *
 					   * SCSI_TMFUNC(2), LOGOUT(1) */
 
-#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX)
+#define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX)
 
-#define ISER_MIN_POSTED_RX		(ISCSI_DEF_XMIT_CMDS_MAX >> 2)
+#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
 
 /* the max TX (send) WR supported by the iSER QP is defined by                 *
  * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
@@ -123,7 +129,7 @@
 
 #define ISER_INFLIGHT_DATAOUTS		8
 
-#define ISER_QP_MAX_REQ_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX *    \
+#define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
 					(1 + ISER_INFLIGHT_DATAOUTS) + \
 					ISER_MAX_TX_MISC_PDUS        + \
 					ISER_MAX_RX_MISC_PDUS)
@@ -205,7 +211,7 @@ struct iser_mem_reg {
 	u64  va;
 	u64  len;
 	void *mem_h;
-	int  is_fmr;
+	int  is_mr;
 };
 
 struct iser_regd_buf {
@@ -246,6 +252,9 @@ struct iser_rx_desc {
 
 #define ISER_MAX_CQ 4
 
+struct iser_conn;
+struct iscsi_iser_task;
+
 struct iser_device {
 	struct ib_device             *ib_device;
 	struct ib_pd	             *pd;
@@ -259,6 +268,22 @@ struct iser_device {
 	int                          cq_active_qps[ISER_MAX_CQ];
 	int			     cqs_used;
 	struct iser_cq_desc	     *cq_desc;
+	int                          (*iser_alloc_rdma_reg_res)(struct iser_conn *ib_conn,
+								unsigned cmds_max);
+	void                         (*iser_free_rdma_reg_res)(struct iser_conn *ib_conn);
+	int                          (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
+							  enum iser_data_dir cmd_dir);
+	void                         (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
+							    enum iser_data_dir cmd_dir);
+};
+
+struct fast_reg_descriptor {
+	struct list_head		  list;
+	/* For fast registration - FRWR */
+	struct ib_mr			 *data_mr;
+	struct ib_fast_reg_page_list     *data_frpl;
+	/* Valid for fast registration flag */
+	bool				  valid;
 };
 
 struct iser_conn {
@@ -270,13 +295,13 @@ struct iser_conn {
 	struct iser_device           *device;       /* device context          */
 	struct rdma_cm_id            *cma_id;       /* CMA ID		       */
 	struct ib_qp	             *qp;           /* QP 		       */
-	struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */
 	wait_queue_head_t	     wait;          /* waitq for conn/disconn  */
+	unsigned		     qp_max_recv_dtos; /* num of rx buffers */
+	unsigned		     qp_max_recv_dtos_mask; /* above minus 1 */
+	unsigned		     min_posted_rx; /* qp_max_recv_dtos >> 2 */
 	int                          post_recv_buf_count; /* posted rx count  */
 	atomic_t                     post_send_buf_count; /* posted tx count   */
 	char 			     name[ISER_OBJECT_NAME_SIZE];
-	struct iser_page_vec         *page_vec;     /* represents SG to fmr maps*
-						     * maps serialized as tx is*/
 	struct list_head	     conn_list;       /* entry in ig conn list */
 
 	char  			     *login_buf;
@@ -285,6 +310,17 @@ struct iser_conn {
 	unsigned int 		     rx_desc_head;
 	struct iser_rx_desc	     *rx_descs;
 	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
+	union {
+		struct {
+			struct ib_fmr_pool      *pool;	   /* pool of IB FMRs         */
+			struct iser_page_vec	*page_vec; /* represents SG to fmr maps*
+							    * maps serialized as tx is*/
+		} fmr;
+		struct {
+			struct list_head	pool;
+			int			pool_size;
+		} frwr;
+	} fastreg;
 };
 
 struct iscsi_iser_conn {
@@ -368,8 +404,10 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn);
 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
 				     enum iser_data_dir         cmd_dir);
 
-int  iser_reg_rdma_mem(struct iscsi_iser_task *task,
-		       enum   iser_data_dir        cmd_dir);
+int  iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
+			   enum iser_data_dir cmd_dir);
+int  iser_reg_rdma_mem_frwr(struct iscsi_iser_task *task,
+			    enum iser_data_dir cmd_dir);
 
 int  iser_connect(struct iser_conn   *ib_conn,
 		  struct sockaddr_in *src_addr,
@@ -380,7 +418,10 @@ int  iser_reg_page_vec(struct iser_conn     *ib_conn,
 		       struct iser_page_vec *page_vec,
 		       struct iser_mem_reg  *mem_reg);
 
-void iser_unreg_mem(struct iser_mem_reg *mem_reg);
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+			enum iser_data_dir cmd_dir);
+void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task,
+			 enum iser_data_dir cmd_dir);
 
 int  iser_post_recvl(struct iser_conn *ib_conn);
 int  iser_post_recvm(struct iser_conn *ib_conn, int count);
@@ -394,5 +435,9 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task);
 int  iser_initialize_task_headers(struct iscsi_task *task,
 			struct iser_tx_desc *tx_desc);
-int iser_alloc_rx_descriptors(struct iser_conn *ib_conn);
+int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session);
+int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max);
+void iser_free_fmr_pool(struct iser_conn *ib_conn);
+int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max);
+void iser_free_frwr_pool(struct iser_conn *ib_conn);
 #endif
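
The connection now sizes its receive ring from the negotiated cmds_max rather than the compile-time ISCSI_DEF_XMIT_CMDS_MAX: qp_max_recv_dtos, its power-of-two index mask and the refill watermark all derive from it. A standalone illustration of that arithmetic with a made-up cmds_max:

#include <stdio.h>

int main(void)
{
	unsigned int cmds_max = 512;			/* assumed power of two */
	unsigned int qp_max_recv_dtos = cmds_max;
	unsigned int dtos_mask = cmds_max - 1;		/* ring index mask */
	unsigned int min_posted_rx = qp_max_recv_dtos >> 2; /* refill watermark */

	printf("dtos=%u mask=0x%x min_posted_rx=%u\n",
	       qp_max_recv_dtos, dtos_mask, min_posted_rx);
	return 0;
}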

+ 114 - 25
drivers/infiniband/ulp/iser/iser_initiator.c

@@ -49,6 +49,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,
 
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_device  *device = iser_task->iser_conn->ib_conn->device;
 	struct iser_regd_buf *regd_buf;
 	int err;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -69,7 +70,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task,
 		return -EINVAL;
 	}
 
-	err = iser_reg_rdma_mem(iser_task,ISER_DIR_IN);
+	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
 	if (err) {
 		iser_err("Failed to set up Data-IN RDMA\n");
 		return err;
@@ -98,6 +99,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 		       unsigned int edtl)
 {
 	struct iscsi_iser_task *iser_task = task->dd_data;
+	struct iser_device  *device = iser_task->iser_conn->ib_conn->device;
 	struct iser_regd_buf *regd_buf;
 	int err;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
@@ -119,7 +121,7 @@ iser_prepare_write_cmd(struct iscsi_task *task,
 		return -EINVAL;
 	}
 
-	err = iser_reg_rdma_mem(iser_task,ISER_DIR_OUT);
+	err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
 	if (err != 0) {
 		iser_err("Failed to register write cmd RDMA mem\n");
 		return err;
@@ -170,8 +172,78 @@ static void iser_create_send_desc(struct iser_conn	*ib_conn,
 	}
 }
 
+static void iser_free_login_buf(struct iser_conn *ib_conn)
+{
+	if (!ib_conn->login_buf)
+		return;
+
+	if (ib_conn->login_req_dma)
+		ib_dma_unmap_single(ib_conn->device->ib_device,
+				    ib_conn->login_req_dma,
+				    ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+
+	if (ib_conn->login_resp_dma)
+		ib_dma_unmap_single(ib_conn->device->ib_device,
+				    ib_conn->login_resp_dma,
+				    ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+
+	kfree(ib_conn->login_buf);
+
+	/* make sure we never redo any unmapping */
+	ib_conn->login_req_dma = 0;
+	ib_conn->login_resp_dma = 0;
+	ib_conn->login_buf = NULL;
+}
+
+static int iser_alloc_login_buf(struct iser_conn *ib_conn)
+{
+	struct iser_device	*device;
+	int			req_err, resp_err;
+
+	BUG_ON(ib_conn->device == NULL);
+
+	device = ib_conn->device;
+
+	ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
+				     ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!ib_conn->login_buf)
+		goto out_err;
+
+	ib_conn->login_req_buf  = ib_conn->login_buf;
+	ib_conn->login_resp_buf = ib_conn->login_buf +
+						ISCSI_DEF_MAX_RECV_SEG_LEN;
+
+	ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device,
+				(void *)ib_conn->login_req_buf,
+				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+
+	ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device,
+				(void *)ib_conn->login_resp_buf,
+				ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+
+	req_err  = ib_dma_mapping_error(device->ib_device,
+					ib_conn->login_req_dma);
+	resp_err = ib_dma_mapping_error(device->ib_device,
+					ib_conn->login_resp_dma);
 
-int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
+	if (req_err || resp_err) {
+		if (req_err)
+			ib_conn->login_req_dma = 0;
+		if (resp_err)
+			ib_conn->login_resp_dma = 0;
+		goto free_login_buf;
+	}
+	return 0;
+
+free_login_buf:
+	iser_free_login_buf(ib_conn);
+
+out_err:
+	iser_err("unable to alloc or map login buf\n");
+	return -ENOMEM;
+}
+
+int iser_alloc_rx_descriptors(struct iser_conn *ib_conn, struct iscsi_session *session)
 {
 	int i, j;
 	u64 dma_addr;
@@ -179,14 +251,24 @@ int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
 	struct ib_sge       *rx_sg;
 	struct iser_device  *device = ib_conn->device;
 
-	ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS *
+	ib_conn->qp_max_recv_dtos = session->cmds_max;
+	ib_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
+	ib_conn->min_posted_rx = ib_conn->qp_max_recv_dtos >> 2;
+
+	if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max))
+		goto create_rdma_reg_res_failed;
+
+	if (iser_alloc_login_buf(ib_conn))
+		goto alloc_login_buf_fail;
+
+	ib_conn->rx_descs = kmalloc(session->cmds_max *
 				sizeof(struct iser_rx_desc), GFP_KERNEL);
 	if (!ib_conn->rx_descs)
 		goto rx_desc_alloc_fail;
 
 	rx_desc = ib_conn->rx_descs;
 
-	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+	for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++)  {
 		dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
 					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(device->ib_device, dma_addr))
@@ -207,10 +289,14 @@ rx_desc_dma_map_failed:
 	rx_desc = ib_conn->rx_descs;
 	for (j = 0; j < i; j++, rx_desc++)
 		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
-			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
 	kfree(ib_conn->rx_descs);
 	ib_conn->rx_descs = NULL;
 rx_desc_alloc_fail:
+	iser_free_login_buf(ib_conn);
+alloc_login_buf_fail:
+	device->iser_free_rdma_reg_res(ib_conn);
+create_rdma_reg_res_failed:
 	iser_err("failed allocating rx descriptors / data buffers\n");
 	return -ENOMEM;
 }
@@ -222,13 +308,21 @@ void iser_free_rx_descriptors(struct iser_conn *ib_conn)
 	struct iser_device *device = ib_conn->device;
 
 	if (!ib_conn->rx_descs)
-		return;
+		goto free_login_buf;
+
+	if (device->iser_free_rdma_reg_res)
+		device->iser_free_rdma_reg_res(ib_conn);
 
 	rx_desc = ib_conn->rx_descs;
-	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)
+	for (i = 0; i < ib_conn->qp_max_recv_dtos; i++, rx_desc++)
 		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
-			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+				    ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
 	kfree(ib_conn->rx_descs);
+	/* make sure we never redo any unmapping */
+	ib_conn->rx_descs = NULL;
+
+free_login_buf:
+	iser_free_login_buf(ib_conn);
 }
 
 static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
@@ -248,9 +342,10 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
 	WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1);
 	WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0);
 
-	iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX);
+	iser_dbg("Initially post: %d\n", iser_conn->ib_conn->min_posted_rx);
 	/* Initial post receive buffers */
-	if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX))
+	if (iser_post_recvm(iser_conn->ib_conn,
+			    iser_conn->ib_conn->min_posted_rx))
 		return -ENOMEM;
 
 	return 0;
@@ -487,9 +582,9 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc,
 		return;
 
 	outstanding = ib_conn->post_recv_buf_count;
-	if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) {
-		count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
-						ISER_MIN_POSTED_RX);
+	if (outstanding + ib_conn->min_posted_rx <= ib_conn->qp_max_recv_dtos) {
+		count = min(ib_conn->qp_max_recv_dtos - outstanding,
+						ib_conn->min_posted_rx);
 		err = iser_post_recvm(ib_conn, count);
 		if (err)
 			iser_err("posting %d rx bufs err %d\n", count, err);
@@ -538,8 +633,8 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
+	struct iser_device *device = iser_task->iser_conn->ib_conn->device;
 	int is_rdma_aligned = 1;
-	struct iser_regd_buf *regd;
 
 	/* if we were reading, copy back to unaligned sglist,
 	 * anyway dma_unmap and free the copy
@@ -553,17 +648,11 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 		iser_finalize_rdma_unaligned_sg(iser_task, ISER_DIR_OUT);
 	}
 
-	if (iser_task->dir[ISER_DIR_IN]) {
-		regd = &iser_task->rdma_regd[ISER_DIR_IN];
-		if (regd->reg.is_fmr)
-			iser_unreg_mem(&regd->reg);
-	}
+	if (iser_task->dir[ISER_DIR_IN])
+		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
 
-	if (iser_task->dir[ISER_DIR_OUT]) {
-		regd = &iser_task->rdma_regd[ISER_DIR_OUT];
-		if (regd->reg.is_fmr)
-			iser_unreg_mem(&regd->reg);
-	}
+	if (iser_task->dir[ISER_DIR_OUT])
+		device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
 
        /* if the data was unaligned, it was already unmapped and then copied */
        if (is_rdma_aligned)

+ 193 - 38
drivers/infiniband/ulp/iser/iser_memory.c

@@ -170,8 +170,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
  */
 
 static int iser_sg_to_page_vec(struct iser_data_buf *data,
-			       struct iser_page_vec *page_vec,
-			       struct ib_device *ibdev)
+			       struct ib_device *ibdev, u64 *pages,
+			       int *offset, int *data_size)
 {
 	struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
 	u64 start_addr, end_addr, page, chunk_start = 0;
@@ -180,7 +180,7 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,
 	int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
 
 	/* compute the offset of first element */
-	page_vec->offset = (u64) sgl[0].offset & ~MASK_4K;
+	*offset = (u64) sgl[0].offset & ~MASK_4K;
 
 	new_chunk = 1;
 	cur_page  = 0;
@@ -204,13 +204,14 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data,
 		   which might be unaligned */
 		page = chunk_start & MASK_4K;
 		do {
-			page_vec->pages[cur_page++] = page;
+			pages[cur_page++] = page;
 			page += SIZE_4K;
 		} while (page < end_addr);
 	}
 
-	page_vec->data_size = total_sz;
-	iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page);
+	*data_size = total_sz;
+	iser_dbg("page_vec->data_size:%d cur_page %d\n",
+		 *data_size, cur_page);
 	return cur_page;
 }
 
@@ -267,11 +268,8 @@ static void iser_data_buf_dump(struct iser_data_buf *data,
 	struct scatterlist *sg;
 	int i;
 
-	if (iser_debug_level == 0)
-		return;
-
 	for_each_sg(sgl, sg, data->dma_nents, i)
-		iser_warn("sg[%d] dma_addr:0x%lX page:0x%p "
+		iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
 			 "off:0x%x sz:0x%x dma_len:0x%x\n",
 			 i, (unsigned long)ib_sg_dma_address(ibdev, sg),
 			 sg_page(sg), sg->offset,
@@ -298,8 +296,10 @@ static void iser_page_vec_build(struct iser_data_buf *data,
 	page_vec->offset = 0;
 
 	iser_dbg("Translating sg sz: %d\n", data->dma_nents);
-	page_vec_len = iser_sg_to_page_vec(data, page_vec, ibdev);
-	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len);
+	page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages,
+					   &page_vec->offset,
+					   &page_vec->data_size);
+	iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len);
 
 	page_vec->length = page_vec_len;
 
@@ -347,16 +347,41 @@ void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task)
 	}
 }
 
+static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
+			      struct ib_device *ibdev,
+			      enum iser_data_dir cmd_dir,
+			      int aligned_len)
+{
+	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
+	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+
+	iscsi_conn->fmr_unalign_cnt++;
+	iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
+		  aligned_len, mem->size);
+
+	if (iser_debug_level > 0)
+		iser_data_buf_dump(mem, ibdev);
+
+	/* unmap the command data before accessing it */
+	iser_dma_unmap_task_data(iser_task);
+
+	/* allocate copy buf, if we are writing, copy the */
+	/* unaligned scatterlist, dma map the copy        */
+	if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
+			return -ENOMEM;
+
+	return 0;
+}
+
 /**
- * iser_reg_rdma_mem - Registers memory intended for RDMA,
- * obtaining rkey and va
+ * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
+ * using FMR (if possible) obtaining rkey and va
  *
  * returns 0 on success, errno code on failure
  */
-int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
-		      enum   iser_data_dir        cmd_dir)
+int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
+			  enum iser_data_dir cmd_dir)
 {
-	struct iscsi_conn    *iscsi_conn = iser_task->iser_conn->iscsi_conn;
 	struct iser_conn     *ib_conn = iser_task->iser_conn->ib_conn;
 	struct iser_device   *device = ib_conn->device;
 	struct ib_device     *ibdev = device->ib_device;
@@ -370,20 +395,13 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
 	regd_buf = &iser_task->rdma_regd[cmd_dir];
 
 	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
-	if (aligned_len != mem->dma_nents ||
-	    (!ib_conn->fmr_pool && mem->dma_nents > 1)) {
-		iscsi_conn->fmr_unalign_cnt++;
-		iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
-			  aligned_len, mem->size);
-		iser_data_buf_dump(mem, ibdev);
-
-		/* unmap the command data before accessing it */
-		iser_dma_unmap_task_data(iser_task);
-
-		/* allocate copy buf, if we are writing, copy the */
-		/* unaligned scatterlist, dma map the copy        */
-		if (iser_start_rdma_unaligned_sg(iser_task, cmd_dir) != 0)
-				return -ENOMEM;
+	if (aligned_len != mem->dma_nents) {
+		err = fall_to_bounce_buf(iser_task, ibdev,
+					 cmd_dir, aligned_len);
+		if (err) {
+			iser_err("failed to allocate bounce buffer\n");
+			return err;
+		}
 		mem = &iser_task->data_copy[cmd_dir];
 	}
 
@@ -395,7 +413,7 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
 		regd_buf->reg.rkey = device->mr->rkey;
 		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
 		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
-		regd_buf->reg.is_fmr = 0;
+		regd_buf->reg.is_mr = 0;
 
 		iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X  "
 			 "va: 0x%08lX sz: %ld]\n",
@@ -404,22 +422,159 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task,
 			 (unsigned long)regd_buf->reg.va,
 			 (unsigned long)regd_buf->reg.len);
 	} else { /* use FMR for multiple dma entries */
-		iser_page_vec_build(mem, ib_conn->page_vec, ibdev);
-		err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg);
+		iser_page_vec_build(mem, ib_conn->fastreg.fmr.page_vec, ibdev);
+		err = iser_reg_page_vec(ib_conn, ib_conn->fastreg.fmr.page_vec,
+					&regd_buf->reg);
 		if (err && err != -EAGAIN) {
 			iser_data_buf_dump(mem, ibdev);
 			iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
 				 mem->dma_nents,
 				 ntoh24(iser_task->desc.iscsi_header.dlength));
 			iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
-				 ib_conn->page_vec->data_size, ib_conn->page_vec->length,
-				 ib_conn->page_vec->offset);
-			for (i=0 ; i<ib_conn->page_vec->length ; i++)
+				 ib_conn->fastreg.fmr.page_vec->data_size,
+				 ib_conn->fastreg.fmr.page_vec->length,
+				 ib_conn->fastreg.fmr.page_vec->offset);
+			for (i = 0; i < ib_conn->fastreg.fmr.page_vec->length; i++)
 				iser_err("page_vec[%d] = 0x%llx\n", i,
-					 (unsigned long long) ib_conn->page_vec->pages[i]);
+					 (unsigned long long) ib_conn->fastreg.fmr.page_vec->pages[i]);
 		}
 		if (err)
 			return err;
 	}
 	return 0;
 }
+
+static int iser_fast_reg_mr(struct fast_reg_descriptor *desc,
+			    struct iser_conn *ib_conn,
+			    struct iser_regd_buf *regd_buf,
+			    u32 offset, unsigned int data_size,
+			    unsigned int page_list_len)
+{
+	struct ib_send_wr fastreg_wr, inv_wr;
+	struct ib_send_wr *bad_wr, *wr = NULL;
+	u8 key;
+	int ret;
+
+	if (!desc->valid) {
+		memset(&inv_wr, 0, sizeof(inv_wr));
+		inv_wr.opcode = IB_WR_LOCAL_INV;
+		inv_wr.send_flags = IB_SEND_SIGNALED;
+		inv_wr.ex.invalidate_rkey = desc->data_mr->rkey;
+		wr = &inv_wr;
+		/* Bump the key */
+		key = (u8)(desc->data_mr->rkey & 0x000000FF);
+		ib_update_fast_reg_key(desc->data_mr, ++key);
+	}
+
+	/* Prepare FASTREG WR */
+	memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+	fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+	fastreg_wr.send_flags = IB_SEND_SIGNALED;
+	fastreg_wr.wr.fast_reg.iova_start = desc->data_frpl->page_list[0] + offset;
+	fastreg_wr.wr.fast_reg.page_list = desc->data_frpl;
+	fastreg_wr.wr.fast_reg.page_list_len = page_list_len;
+	fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
+	fastreg_wr.wr.fast_reg.length = data_size;
+	fastreg_wr.wr.fast_reg.rkey = desc->data_mr->rkey;
+	fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE  |
+					       IB_ACCESS_REMOTE_WRITE |
+					       IB_ACCESS_REMOTE_READ);
+
+	if (!wr) {
+		wr = &fastreg_wr;
+		atomic_inc(&ib_conn->post_send_buf_count);
+	} else {
+		wr->next = &fastreg_wr;
+		atomic_add(2, &ib_conn->post_send_buf_count);
+	}
+
+	ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
+	if (ret) {
+		if (bad_wr->next)
+			atomic_sub(2, &ib_conn->post_send_buf_count);
+		else
+			atomic_dec(&ib_conn->post_send_buf_count);
+		iser_err("fast registration failed, ret:%d\n", ret);
+		return ret;
+	}
+	desc->valid = false;
+
+	regd_buf->reg.mem_h = desc;
+	regd_buf->reg.lkey = desc->data_mr->lkey;
+	regd_buf->reg.rkey = desc->data_mr->rkey;
+	regd_buf->reg.va = desc->data_frpl->page_list[0] + offset;
+	regd_buf->reg.len = data_size;
+	regd_buf->reg.is_mr = 1;
+
+	return ret;
+}
+
+/**
+ * iser_reg_rdma_mem_frwr - Registers memory intended for RDMA,
+ * using Fast Registration WR (if possible) obtaining rkey and va
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_reg_rdma_mem_frwr(struct iscsi_iser_task *iser_task,
+			   enum iser_data_dir cmd_dir)
+{
+	struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
+	struct iser_device *device = ib_conn->device;
+	struct ib_device *ibdev = device->ib_device;
+	struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+	struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir];
+	struct fast_reg_descriptor *desc;
+	unsigned int data_size, page_list_len;
+	int err, aligned_len;
+	unsigned long flags;
+	u32 offset;
+
+	aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+	if (aligned_len != mem->dma_nents) {
+		err = fall_to_bounce_buf(iser_task, ibdev,
+					 cmd_dir, aligned_len);
+		if (err) {
+			iser_err("failed to allocate bounce buffer\n");
+			return err;
+		}
+		mem = &iser_task->data_copy[cmd_dir];
+	}
+
+	/* if there is a single dma entry, dma mr suffices */
+	if (mem->dma_nents == 1) {
+		struct scatterlist *sg = (struct scatterlist *)mem->buf;
+
+		regd_buf->reg.lkey = device->mr->lkey;
+		regd_buf->reg.rkey = device->mr->rkey;
+		regd_buf->reg.len  = ib_sg_dma_len(ibdev, &sg[0]);
+		regd_buf->reg.va   = ib_sg_dma_address(ibdev, &sg[0]);
+		regd_buf->reg.is_mr = 0;
+	} else {
+		spin_lock_irqsave(&ib_conn->lock, flags);
+		desc = list_first_entry(&ib_conn->fastreg.frwr.pool,
+					struct fast_reg_descriptor, list);
+		list_del(&desc->list);
+		spin_unlock_irqrestore(&ib_conn->lock, flags);
+		page_list_len = iser_sg_to_page_vec(mem, device->ib_device,
+						    desc->data_frpl->page_list,
+						    &offset, &data_size);
+
+		if (page_list_len * SIZE_4K < data_size) {
+			iser_err("fast reg page_list too short to hold this SG\n");
+			err = -EINVAL;
+			goto err_reg;
+		}
+
+		err = iser_fast_reg_mr(desc, ib_conn, regd_buf,
+				       offset, data_size, page_list_len);
+		if (err)
+			goto err_reg;
+	}
+
+	return 0;
+err_reg:
+	spin_lock_irqsave(&ib_conn->lock, flags);
+	list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
+	spin_unlock_irqrestore(&ib_conn->lock, flags);
+	return err;
+}
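
When the FRWR path above reuses a descriptor it invalidates the MR and bumps the low "key" byte of its rkey through ib_update_fast_reg_key(), so the re-registration hands out a fresh rkey instead of the stale one. The byte arithmetic in isolation, on plain integers and for illustration only:

#include <stdio.h>
#include <stdint.h>

/* New rkey = same upper 24 bits, low key byte incremented (mod 256). */
static uint32_t bump_rkey(uint32_t rkey)
{
	uint8_t key = (uint8_t)(rkey & 0xff);

	return (rkey & 0xffffff00u) | (uint8_t)(key + 1);
}

int main(void)
{
	printf("0x%08x -> 0x%08x\n", 0x12345678u,
	       (unsigned)bump_rkey(0x12345678u));
	return 0;
}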

+ 211 - 81
drivers/infiniband/ulp/iser/iser_verbs.c

@@ -73,6 +73,36 @@ static int iser_create_device_ib_res(struct iser_device *device)
 {
 	int i, j;
 	struct iser_cq_desc *cq_desc;
+	struct ib_device_attr *dev_attr;
+
+	dev_attr = kmalloc(sizeof(*dev_attr), GFP_KERNEL);
+	if (!dev_attr)
+		return -ENOMEM;
+
+	if (ib_query_device(device->ib_device, dev_attr)) {
+		pr_warn("Query device failed for %s\n", device->ib_device->name);
+		goto dev_attr_err;
+	}
+
+	/* Assign function handles  - based on FMR support */
+	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
+	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+		iser_info("FMR supported, using FMR for registration\n");
+		device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
+		device->iser_free_rdma_reg_res = iser_free_fmr_pool;
+		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
+		device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
+	} else
+	if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		iser_info("FRWR supported, using FRWR for registration\n");
+		device->iser_alloc_rdma_reg_res = iser_create_frwr_pool;
+		device->iser_free_rdma_reg_res = iser_free_frwr_pool;
+		device->iser_reg_rdma_mem = iser_reg_rdma_mem_frwr;
+		device->iser_unreg_rdma_mem = iser_unreg_mem_frwr;
+	} else {
+		iser_err("IB device does not support FMRs nor FRWRs, can't register memory\n");
+		goto dev_attr_err;
+	}
 
 	device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
 	iser_info("using %d CQs, device %s supports %d vectors\n",
@@ -128,6 +158,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
 	if (ib_register_event_handler(&device->event_handler))
 		goto handler_err;
 
+	kfree(dev_attr);
 	return 0;
 
 handler_err:
@@ -147,6 +178,8 @@ pd_err:
 	kfree(device->cq_desc);
 cq_desc_err:
 	iser_err("failed to allocate an IB resource\n");
+dev_attr_err:
+	kfree(dev_attr);
 	return -1;
 }
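The dev_attr probe above is the only place the registration scheme is chosen; everything else goes through the handles it installs. If another ULP only needed the FRWR half of the test, a stripped-down sketch could look like the following (stack-allocated attributes for brevity, whereas the hunk above deliberately kmalloc()s the large struct):

/* Minimal sketch: does this IB device support fast registration WRs? */
static bool example_supports_frwr(struct ib_device *ib_dev)
{
	struct ib_device_attr attr;

	if (ib_query_device(ib_dev, &attr))
		return false;

	return !!(attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS);
}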
 
@@ -178,56 +211,23 @@ static void iser_free_device_ib_res(struct iser_device *device)
 }
 
 /**
- * iser_create_ib_conn_res - Creates FMR pool and Queue-Pair (QP)
+ * iser_create_fmr_pool - Creates FMR pool and page_vector
  *
- * returns 0 on success, -1 on failure
+ * returns 0 on success, or errno code on failure
  */
-static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
+int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
 {
-	struct iser_device	*device;
-	struct ib_qp_init_attr	init_attr;
-	int			req_err, resp_err, ret = -ENOMEM;
+	struct iser_device *device = ib_conn->device;
 	struct ib_fmr_pool_param params;
-	int index, min_index = 0;
-
-	BUG_ON(ib_conn->device == NULL);
-
-	device = ib_conn->device;
-
-	ib_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
-					ISER_RX_LOGIN_SIZE, GFP_KERNEL);
-	if (!ib_conn->login_buf)
-		goto out_err;
-
-	ib_conn->login_req_buf  = ib_conn->login_buf;
-	ib_conn->login_resp_buf = ib_conn->login_buf + ISCSI_DEF_MAX_RECV_SEG_LEN;
-
-	ib_conn->login_req_dma = ib_dma_map_single(ib_conn->device->ib_device,
-				(void *)ib_conn->login_req_buf,
-				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
-
-	ib_conn->login_resp_dma = ib_dma_map_single(ib_conn->device->ib_device,
-				(void *)ib_conn->login_resp_buf,
-				ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
-
-	req_err  = ib_dma_mapping_error(device->ib_device, ib_conn->login_req_dma);
-	resp_err = ib_dma_mapping_error(device->ib_device, ib_conn->login_resp_dma);
-
-	if (req_err || resp_err) {
-		if (req_err)
-			ib_conn->login_req_dma = 0;
-		if (resp_err)
-			ib_conn->login_resp_dma = 0;
-		goto out_err;
-	}
+	int ret = -ENOMEM;
 
-	ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
-				    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
-				    GFP_KERNEL);
-	if (!ib_conn->page_vec)
-		goto out_err;
+	ib_conn->fastreg.fmr.page_vec = kmalloc(sizeof(struct iser_page_vec) +
+						(sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
+						GFP_KERNEL);
+	if (!ib_conn->fastreg.fmr.page_vec)
+		return ret;
 
-	ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1);
+	ib_conn->fastreg.fmr.page_vec->pages = (u64 *)(ib_conn->fastreg.fmr.page_vec + 1);
 
 	params.page_shift        = SHIFT_4K;
 	/* when the first/last SG element are not start/end *
@@ -235,24 +235,143 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 	params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
 	/* make the pool size twice the max number of SCSI commands *
 	 * the ML is expected to queue, watermark for unmap at 50%  */
-	params.pool_size	 = ISCSI_DEF_XMIT_CMDS_MAX * 2;
-	params.dirty_watermark	 = ISCSI_DEF_XMIT_CMDS_MAX;
+	params.pool_size	 = cmds_max * 2;
+	params.dirty_watermark	 = cmds_max;
 	params.cache		 = 0;
 	params.flush_function	 = NULL;
 	params.access		 = (IB_ACCESS_LOCAL_WRITE  |
 				    IB_ACCESS_REMOTE_WRITE |
 				    IB_ACCESS_REMOTE_READ);
 
-	ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, &params);
-	ret = PTR_ERR(ib_conn->fmr_pool);
-	if (IS_ERR(ib_conn->fmr_pool) && ret != -ENOSYS) {
-		ib_conn->fmr_pool = NULL;
-		goto out_err;
-	} else if (ret == -ENOSYS) {
-		ib_conn->fmr_pool = NULL;
+	ib_conn->fastreg.fmr.pool = ib_create_fmr_pool(device->pd, &params);
+	if (!IS_ERR(ib_conn->fastreg.fmr.pool))
+		return 0;
+
+	/* no FMR => no need for page_vec */
+	kfree(ib_conn->fastreg.fmr.page_vec);
+	ib_conn->fastreg.fmr.page_vec = NULL;
+
+	ret = PTR_ERR(ib_conn->fastreg.fmr.pool);
+	ib_conn->fastreg.fmr.pool = NULL;
+	if (ret != -ENOSYS) {
+		iser_err("FMR allocation failed, err %d\n", ret);
+		return ret;
+	} else {
 		iser_warn("FMRs are not supported, using unaligned mode\n");
-		ret = 0;
+		return 0;
 	}
+}
+
+/**
+ * iser_free_fmr_pool - releases the FMR pool and page vec
+ */
+void iser_free_fmr_pool(struct iser_conn *ib_conn)
+{
+	iser_info("freeing conn %p fmr pool %p\n",
+		  ib_conn, ib_conn->fastreg.fmr.pool);
+
+	if (ib_conn->fastreg.fmr.pool != NULL)
+		ib_destroy_fmr_pool(ib_conn->fastreg.fmr.pool);
+
+	ib_conn->fastreg.fmr.pool = NULL;
+
+	kfree(ib_conn->fastreg.fmr.page_vec);
+	ib_conn->fastreg.fmr.page_vec = NULL;
+}
+
+/**
+ * iser_create_frwr_pool - Creates pool of fast_reg descriptors
+ * for fast registration work requests.
+ * returns 0 on success, or errno code on failure
+ */
+int iser_create_frwr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
+{
+	struct iser_device	*device = ib_conn->device;
+	struct fast_reg_descriptor	*desc;
+	int i, ret;
+
+	INIT_LIST_HEAD(&ib_conn->fastreg.frwr.pool);
+	ib_conn->fastreg.frwr.pool_size = 0;
+	for (i = 0; i < cmds_max; i++) {
+		desc = kmalloc(sizeof(*desc), GFP_KERNEL);
+		if (!desc) {
+			iser_err("Failed to allocate a new fast_reg descriptor\n");
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		desc->data_frpl = ib_alloc_fast_reg_page_list(device->ib_device,
+							 ISCSI_ISER_SG_TABLESIZE + 1);
+		if (IS_ERR(desc->data_frpl)) {
+			ret = PTR_ERR(desc->data_frpl);
+			iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n", ret);
+			goto fast_reg_page_failure;
+		}
+
+		desc->data_mr = ib_alloc_fast_reg_mr(device->pd,
+						     ISCSI_ISER_SG_TABLESIZE + 1);
+		if (IS_ERR(desc->data_mr)) {
+			ret = PTR_ERR(desc->data_mr);
+			iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+			goto fast_reg_mr_failure;
+		}
+		desc->valid = true;
+		list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
+		ib_conn->fastreg.frwr.pool_size++;
+	}
+
+	return 0;
+
+fast_reg_mr_failure:
+	ib_free_fast_reg_page_list(desc->data_frpl);
+fast_reg_page_failure:
+	kfree(desc);
+err:
+	iser_free_frwr_pool(ib_conn);
+	return ret;
+}
+
+/**
+ * iser_free_frwr_pool - releases the pool of fast_reg descriptors
+ */
+void iser_free_frwr_pool(struct iser_conn *ib_conn)
+{
+	struct fast_reg_descriptor *desc, *tmp;
+	int i = 0;
+
+	if (list_empty(&ib_conn->fastreg.frwr.pool))
+		return;
+
+	iser_info("freeing conn %p frwr pool\n", ib_conn);
+
+	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.frwr.pool, list) {
+		list_del(&desc->list);
+		ib_free_fast_reg_page_list(desc->data_frpl);
+		ib_dereg_mr(desc->data_mr);
+		kfree(desc);
+		++i;
+	}
+
+	if (i < ib_conn->fastreg.frwr.pool_size)
+		iser_warn("pool still has %d regions registered\n",
+			  ib_conn->fastreg.frwr.pool_size - i);
+}
+
+/**
+ * iser_create_ib_conn_res - Creates the Queue-Pair (QP)
+ *
+ * returns 0 on success, -1 on failure
+ */
+static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
+{
+	struct iser_device	*device;
+	struct ib_qp_init_attr	init_attr;
+	int			ret = -ENOMEM;
+	int index, min_index = 0;
+
+	BUG_ON(ib_conn->device == NULL);
+
+	device = ib_conn->device;
 
 	memset(&init_attr, 0, sizeof init_attr);
 
@@ -282,9 +401,9 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
 		goto out_err;
 
 	ib_conn->qp = ib_conn->cma_id->qp;
-	iser_info("setting conn %p cma_id %p: fmr_pool %p qp %p\n",
+	iser_info("setting conn %p cma_id %p qp %p\n",
 		  ib_conn, ib_conn->cma_id,
-		  ib_conn->fmr_pool, ib_conn->cma_id->qp);
+		  ib_conn->cma_id->qp);
 	return ret;
 
 out_err:
@@ -293,7 +412,7 @@ out_err:
 }
 
 /**
- * releases the FMR pool and QP objects, returns 0 on success,
+ * releases the QP objects, returns 0 on success,
  * -1 on failure
  */
 static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
@@ -301,13 +420,11 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
 	int cq_index;
 	BUG_ON(ib_conn == NULL);
 
-	iser_info("freeing conn %p cma_id %p fmr pool %p qp %p\n",
+	iser_info("freeing conn %p cma_id %p qp %p\n",
 		  ib_conn, ib_conn->cma_id,
-		  ib_conn->fmr_pool, ib_conn->qp);
+		  ib_conn->qp);
 
 	/* qp is created only once both addr & route are resolved */
-	if (ib_conn->fmr_pool != NULL)
-		ib_destroy_fmr_pool(ib_conn->fmr_pool);
 
 	if (ib_conn->qp != NULL) {
 		cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index;
@@ -316,21 +433,7 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn)
 		rdma_destroy_qp(ib_conn->cma_id);
 	}
 
-	ib_conn->fmr_pool = NULL;
 	ib_conn->qp	  = NULL;
-	kfree(ib_conn->page_vec);
-
-	if (ib_conn->login_buf) {
-		if (ib_conn->login_req_dma)
-			ib_dma_unmap_single(ib_conn->device->ib_device,
-				ib_conn->login_req_dma,
-				ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
-		if (ib_conn->login_resp_dma)
-			ib_dma_unmap_single(ib_conn->device->ib_device,
-				ib_conn->login_resp_dma,
-				ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
-		kfree(ib_conn->login_buf);
-	}
 
 	return 0;
 }
@@ -694,7 +797,7 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,
 	page_list = page_vec->pages;
 	io_addr	  = page_list[0];
 
-	mem  = ib_fmr_pool_map_phys(ib_conn->fmr_pool,
+	mem  = ib_fmr_pool_map_phys(ib_conn->fastreg.fmr.pool,
 				    page_list,
 				    page_vec->length,
 				    io_addr);
@@ -709,7 +812,7 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,
 	mem_reg->rkey  = mem->fmr->rkey;
 	mem_reg->len   = page_vec->length * SIZE_4K;
 	mem_reg->va    = io_addr;
-	mem_reg->is_fmr = 1;
+	mem_reg->is_mr = 1;
 	mem_reg->mem_h = (void *)mem;
 
 	mem_reg->va   += page_vec->offset;
@@ -727,12 +830,18 @@ int iser_reg_page_vec(struct iser_conn     *ib_conn,
 }
 
 /**
- * Unregister (previosuly registered) memory.
+ * Unregister (previously registered using FMR) memory.
+ * If the memory was not registered via FMR, this does nothing.
  */
-void iser_unreg_mem(struct iser_mem_reg *reg)
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+			enum iser_data_dir cmd_dir)
 {
+	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
 	int ret;
 
+	if (!reg->is_mr)
+		return;
+
 	iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
 
 	ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
@@ -742,6 +851,23 @@ void iser_unreg_mem(struct iser_mem_reg *reg)
 	reg->mem_h = NULL;
 }
 
+void iser_unreg_mem_frwr(struct iscsi_iser_task *iser_task,
+			 enum iser_data_dir cmd_dir)
+{
+	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
+	struct iser_conn *ib_conn = iser_task->iser_conn->ib_conn;
+	struct fast_reg_descriptor *desc = reg->mem_h;
+
+	if (!reg->is_mr)
+		return;
+
+	reg->mem_h = NULL;
+	reg->is_mr = 0;
+	spin_lock_bh(&ib_conn->lock);
+	list_add_tail(&desc->list, &ib_conn->fastreg.frwr.pool);
+	spin_unlock_bh(&ib_conn->lock);
+}
+
 int iser_post_recvl(struct iser_conn *ib_conn)
 {
 	struct ib_recv_wr rx_wr, *rx_wr_failed;
@@ -779,7 +905,7 @@ int iser_post_recvm(struct iser_conn *ib_conn, int count)
 		rx_wr->sg_list	= &rx_desc->rx_sg;
 		rx_wr->num_sge	= 1;
 		rx_wr->next	= rx_wr + 1;
-		my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1);
+		my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask;
 	}
 
 	rx_wr--;
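Replacing the compile-time ISER_QP_MAX_RECV_DTOS constant with a per-connection qp_max_recv_dtos_mask lets the receive ring be sized per session, but the "& mask" wrap only works when the ring size is a power of two (mask = size - 1). A small self-contained illustration of that invariant:

#include <assert.h>
#include <stdio.h>

/* head = (head + 1) & (size - 1) is equivalent to modulo size,
 * but only when size is a power of two.
 */
int main(void)
{
	unsigned int size = 512;		/* example ring size */
	unsigned int mask = size - 1;
	unsigned int head = 510;
	int i;

	assert(size && (size & (size - 1)) == 0);	/* power of two */

	for (i = 0; i < 4; i++) {
		printf("head = %u\n", head);		/* 510 511 0 1 */
		head = (head + 1) & mask;
	}
	return 0;
}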
@@ -863,7 +989,11 @@ static int iser_drain_tx_cq(struct iser_device  *device, int cq_index)
 		if (wc.status == IB_WC_SUCCESS) {
 			if (wc.opcode == IB_WC_SEND)
 				iser_snd_completion(tx_desc, ib_conn);
-			else
+			else if (wc.opcode == IB_WC_LOCAL_INV ||
+				 wc.opcode == IB_WC_FAST_REG_MR) {
+				atomic_dec(&ib_conn->post_send_buf_count);
+				continue;
+			} else
 				iser_err("expected opcode %d got %d\n",
 					IB_WC_SEND, wc.opcode);
 		} else {

+ 2 - 1
drivers/net/ethernet/mellanox/mlx4/resource_tracker.c

@@ -1909,7 +1909,8 @@ static int qp_get_mtt_size(struct mlx4_qp_context *qpc)
 	int log_rq_stride = qpc->rq_size_stride & 7;
 	int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1;
 	int rss = (be32_to_cpu(qpc->flags) >> 13) & 1;
-	int xrc = (be32_to_cpu(qpc->local_qpn) >> 23) & 1;
+	u32 ts = (be32_to_cpu(qpc->flags) >> 16) & 0xff;
+	int xrc = (ts == MLX4_QP_ST_XRC) ? 1 : 0;
 	int sq_size;
 	int rq_size;
 	int total_pages;
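The old test inferred XRC from bit 23 of local_qpn; the new code decodes the transport service type field (bits 16 to 23 of qpc->flags) and compares it against MLX4_QP_ST_XRC. A tiny worked example of that decode; the 0x6 encoding and the sample flags word are purely illustrative, and the kernel code also byte-swaps with be32_to_cpu() first:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_QP_ST_XRC 0x6	/* assumed encoding, for illustration only */

int main(void)
{
	uint32_t flags = 0x00060000;		/* ts field (bits 16..23) = 0x6 */
	uint32_t ts = (flags >> 16) & 0xff;

	printf("ts = 0x%x, xrc = %d\n", (unsigned)ts, ts == EXAMPLE_QP_ST_XRC);
	return 0;
}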

+ 0 - 5
include/linux/mlx4/device.h

@@ -1052,11 +1052,6 @@ struct _rule_hw {
 	};
 };
 
-/* translating DMFS verbs sniffer rule to the FW API would need two reg IDs */
-struct mlx4_flow_handle {
-	u64 reg_id[2];
-};
-
 int mlx4_flow_steer_promisc_add(struct mlx4_dev *dev, u8 port, u32 qpn,
 				enum mlx4_net_trans_promisc_mode mode);
 int mlx4_flow_steer_promisc_remove(struct mlx4_dev *dev, u8 port,

+ 126 - 2
include/rdma/ib_verbs.h

@@ -116,7 +116,8 @@ enum ib_device_cap_flags {
 	IB_DEVICE_MEM_MGT_EXTENSIONS	= (1<<21),
 	IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
 	IB_DEVICE_MEM_WINDOW_TYPE_2A	= (1<<23),
-	IB_DEVICE_MEM_WINDOW_TYPE_2B	= (1<<24)
+	IB_DEVICE_MEM_WINDOW_TYPE_2B	= (1<<24),
+	IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29)
 };
 
 enum ib_atomic_cap {
@@ -635,6 +636,12 @@ enum ib_qp_create_flags {
 	IB_QP_CREATE_RESERVED_END		= 1 << 31,
 };
 
+
+/*
+ * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
+ * callback to destroy the passed in QP.
+ */
+
 struct ib_qp_init_attr {
 	void                  (*event_handler)(struct ib_event *, void *);
 	void		       *qp_context;
@@ -953,6 +960,7 @@ struct ib_ucontext {
 	struct list_head	srq_list;
 	struct list_head	ah_list;
 	struct list_head	xrcd_list;
+	struct list_head	rule_list;
 	int			closing;
 };
 
@@ -1033,7 +1041,8 @@ struct ib_qp {
 	struct ib_srq	       *srq;
 	struct ib_xrcd	       *xrcd; /* XRC TGT QPs only */
 	struct list_head	xrcd_list;
-	atomic_t		usecnt; /* count times opened, mcast attaches */
+	/* count times opened, mcast attaches, flow attaches */
+	atomic_t		usecnt;
 	struct list_head	open_list;
 	struct ib_qp           *real_qp;
 	struct ib_uobject      *uobject;
@@ -1068,6 +1077,112 @@ struct ib_fmr {
 	u32			rkey;
 };
 
+/* Supported steering options */
+enum ib_flow_attr_type {
+	/* steering according to rule specifications */
+	IB_FLOW_ATTR_NORMAL		= 0x0,
+	/* default unicast and multicast rule -
+	 * receive all Eth traffic which isn't steered to any QP
+	 */
+	IB_FLOW_ATTR_ALL_DEFAULT	= 0x1,
+	/* default multicast rule -
+	 * receive all Eth multicast traffic which isn't steered to any QP
+	 */
+	IB_FLOW_ATTR_MC_DEFAULT		= 0x2,
+	/* sniffer rule - receive all port traffic */
+	IB_FLOW_ATTR_SNIFFER		= 0x3
+};
+
+/* Supported steering header types */
+enum ib_flow_spec_type {
+	/* L2 headers */
+	IB_FLOW_SPEC_ETH	= 0x20,
+	/* L3 header */
+	IB_FLOW_SPEC_IPV4	= 0x30,
+	/* L4 headers */
+	IB_FLOW_SPEC_TCP	= 0x40,
+	IB_FLOW_SPEC_UDP	= 0x41
+};
+
+#define IB_FLOW_SPEC_SUPPORT_LAYERS 4
+
+/* Flow steering rule priority is set according to its domain.
+ * Lower domain value means higher priority.
+ */
+enum ib_flow_domain {
+	IB_FLOW_DOMAIN_USER,
+	IB_FLOW_DOMAIN_ETHTOOL,
+	IB_FLOW_DOMAIN_RFS,
+	IB_FLOW_DOMAIN_NIC,
+	IB_FLOW_DOMAIN_NUM /* Must be last */
+};
+
+struct ib_flow_eth_filter {
+	u8	dst_mac[6];
+	u8	src_mac[6];
+	__be16	ether_type;
+	__be16	vlan_tag;
+};
+
+struct ib_flow_spec_eth {
+	enum ib_flow_spec_type	  type;
+	u16			  size;
+	struct ib_flow_eth_filter val;
+	struct ib_flow_eth_filter mask;
+};
+
+struct ib_flow_ipv4_filter {
+	__be32	src_ip;
+	__be32	dst_ip;
+};
+
+struct ib_flow_spec_ipv4 {
+	enum ib_flow_spec_type	   type;
+	u16			   size;
+	struct ib_flow_ipv4_filter val;
+	struct ib_flow_ipv4_filter mask;
+};
+
+struct ib_flow_tcp_udp_filter {
+	__be16	dst_port;
+	__be16	src_port;
+};
+
+struct ib_flow_spec_tcp_udp {
+	enum ib_flow_spec_type	      type;
+	u16			      size;
+	struct ib_flow_tcp_udp_filter val;
+	struct ib_flow_tcp_udp_filter mask;
+};
+
+union ib_flow_spec {
+	struct {
+		enum ib_flow_spec_type	type;
+		u16			size;
+	};
+	struct ib_flow_spec_eth		eth;
+	struct ib_flow_spec_ipv4        ipv4;
+	struct ib_flow_spec_tcp_udp	tcp_udp;
+};
+
+struct ib_flow_attr {
+	enum ib_flow_attr_type type;
+	u16	     size;
+	u16	     priority;
+	u32	     flags;
+	u8	     num_of_specs;
+	u8	     port;
+	/* Following are the optional layers according to user request
+	 * struct ib_flow_spec_xxx
+	 * struct ib_flow_spec_yyy
+	 */
+};
+
+struct ib_flow {
+	struct ib_qp		*qp;
+	struct ib_uobject	*uobject;
+};
+
 struct ib_mad;
 struct ib_grh;
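A flow rule is passed to the new create_flow verb as an ib_flow_attr immediately followed in memory by num_of_specs variable-size specs, each starting with its own type/size pair. The sketch below shows how an in-kernel consumer might attach a simple destination-MAC rule to a QP; the wrapper name, port number, and domain choice are illustrative, and a real caller must check the result with IS_ERR() and later pass it to ib_destroy_flow().

/* Illustrative: steer Ethernet frames with a given destination MAC to 'qp'. */
static struct ib_flow *example_attach_mac_rule(struct ib_qp *qp,
					       const u8 *dmac, u8 port)
{
	struct {
		struct ib_flow_attr	attr;
		struct ib_flow_spec_eth	eth;
	} rule = {
		.attr = {
			.type	      = IB_FLOW_ATTR_NORMAL,
			.size	      = sizeof(rule),
			.num_of_specs = 1,
			.port	      = port,
		},
		.eth = {
			.type = IB_FLOW_SPEC_ETH,
			.size = sizeof(struct ib_flow_spec_eth),
		},
	};

	memcpy(rule.eth.val.dst_mac, dmac, ETH_ALEN);
	memset(rule.eth.mask.dst_mac, 0xff, ETH_ALEN);	/* match all 6 bytes */

	return ib_create_flow(qp, &rule.attr, IB_FLOW_DOMAIN_USER);
}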
 
@@ -1300,6 +1415,11 @@ struct ib_device {
 						 struct ib_ucontext *ucontext,
 						 struct ib_udata *udata);
 	int			   (*dealloc_xrcd)(struct ib_xrcd *xrcd);
+	struct ib_flow *	   (*create_flow)(struct ib_qp *qp,
+						  struct ib_flow_attr
+						  *flow_attr,
+						  int domain);
+	int			   (*destroy_flow)(struct ib_flow *flow_id);
 
 	struct ib_dma_mapping_ops   *dma_ops;
 
@@ -2260,4 +2380,8 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device);
  */
 int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
 
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+			       struct ib_flow_attr *flow_attr, int domain);
+int ib_destroy_flow(struct ib_flow *flow_id);
+
 #endif /* IB_VERBS_H */

+ 98 - 1
include/uapi/rdma/ib_user_verbs.h

@@ -43,6 +43,7 @@
  * compatibility are made.
  */
 #define IB_USER_VERBS_ABI_VERSION	6
+#define IB_USER_VERBS_CMD_THRESHOLD    50
 
 enum {
 	IB_USER_VERBS_CMD_GET_CONTEXT,
@@ -85,7 +86,9 @@ enum {
 	IB_USER_VERBS_CMD_OPEN_XRCD,
 	IB_USER_VERBS_CMD_CLOSE_XRCD,
 	IB_USER_VERBS_CMD_CREATE_XSRQ,
-	IB_USER_VERBS_CMD_OPEN_QP
+	IB_USER_VERBS_CMD_OPEN_QP,
+	IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
+	IB_USER_VERBS_CMD_DESTROY_FLOW
 };
 
 /*
@@ -123,6 +126,15 @@ struct ib_uverbs_cmd_hdr {
 	__u16 out_words;
 };
 
+struct ib_uverbs_cmd_hdr_ex {
+	__u32 command;
+	__u16 in_words;
+	__u16 out_words;
+	__u16 provider_in_words;
+	__u16 provider_out_words;
+	__u32 cmd_hdr_reserved;
+};
+
 struct ib_uverbs_get_context {
 	__u64 response;
 	__u64 driver_data[0];
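IB_USER_VERBS_CMD_THRESHOLD splits the command space: commands below it keep the original 8-byte ib_uverbs_cmd_hdr, while the new flow-steering commands use ib_uverbs_cmd_hdr_ex, which adds separate provider (driver-private) in/out word counts. The sketch below only illustrates the header-selection idea; it is a simplification, not the exact code of the uverbs write() handler.

/* Simplified: read the legacy header first, then, for commands at or
 * above IB_USER_VERBS_CMD_THRESHOLD, re-read the extended header.
 */
static int example_read_cmd_hdr(const char __user *buf, size_t count,
				struct ib_uverbs_cmd_hdr_ex *hdr_ex)
{
	struct ib_uverbs_cmd_hdr hdr;

	if (count < sizeof(hdr))
		return -EINVAL;
	if (copy_from_user(&hdr, buf, sizeof(hdr)))
		return -EFAULT;

	if (hdr.command < IB_USER_VERBS_CMD_THRESHOLD) {
		memset(hdr_ex, 0, sizeof(*hdr_ex));
		hdr_ex->command   = hdr.command;
		hdr_ex->in_words  = hdr.in_words;
		hdr_ex->out_words = hdr.out_words;
		return 0;
	}

	if (count < sizeof(*hdr_ex))
		return -EINVAL;
	if (copy_from_user(hdr_ex, buf, sizeof(*hdr_ex)))
		return -EFAULT;

	return 0;
}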
@@ -684,6 +696,91 @@ struct ib_uverbs_detach_mcast {
 	__u64 driver_data[0];
 };
 
+struct ib_kern_eth_filter {
+	__u8  dst_mac[6];
+	__u8  src_mac[6];
+	__be16 ether_type;
+	__be16 vlan_tag;
+};
+
+struct ib_kern_spec_eth {
+	__u32  type;
+	__u16  size;
+	__u16  reserved;
+	struct ib_kern_eth_filter val;
+	struct ib_kern_eth_filter mask;
+};
+
+struct ib_kern_ipv4_filter {
+	__be32 src_ip;
+	__be32 dst_ip;
+};
+
+struct ib_kern_spec_ipv4 {
+	__u32  type;
+	__u16  size;
+	__u16  reserved;
+	struct ib_kern_ipv4_filter val;
+	struct ib_kern_ipv4_filter mask;
+};
+
+struct ib_kern_tcp_udp_filter {
+	__be16 dst_port;
+	__be16 src_port;
+};
+
+struct ib_kern_spec_tcp_udp {
+	__u32  type;
+	__u16  size;
+	__u16  reserved;
+	struct ib_kern_tcp_udp_filter val;
+	struct ib_kern_tcp_udp_filter mask;
+};
+
+struct ib_kern_spec {
+	union {
+		struct {
+			__u32 type;
+			__u16 size;
+			__u16 reserved;
+		};
+		struct ib_kern_spec_eth	    eth;
+		struct ib_kern_spec_ipv4    ipv4;
+		struct ib_kern_spec_tcp_udp tcp_udp;
+	};
+};
+
+struct ib_kern_flow_attr {
+	__u32 type;
+	__u16 size;
+	__u16 priority;
+	__u8  num_of_specs;
+	__u8  reserved[2];
+	__u8  port;
+	__u32 flags;
+	/* Following are the optional layers according to user request
+	 * struct ib_flow_spec_xxx
+	 * struct ib_flow_spec_yyy
+	 */
+};
+
+struct ib_uverbs_create_flow {
+	__u32 comp_mask;
+	__u64 response;
+	__u32 qp_handle;
+	struct ib_kern_flow_attr flow_attr;
+};
+
+struct ib_uverbs_create_flow_resp {
+	__u32 comp_mask;
+	__u32 flow_handle;
+};
+
+struct ib_uverbs_destroy_flow {
+	__u32 comp_mask;
+	__u32 flow_handle;
+};
+
 struct ib_uverbs_create_srq {
 	__u64 response;
 	__u64 user_handle;
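Userspace packs the create_flow command as the fixed ib_uverbs_create_flow header above, followed by flow_attr.num_of_specs entries of ib_kern_spec, each self-describing through its leading type/size fields. A minimal sketch of how the kernel side can walk that variable-size tail; bounds checks are trimmed to the essentials, and the real handler also converts each spec into the matching ib_flow_spec:

/* Illustrative: iterate the specs packed after the create_flow command. */
static int example_walk_kern_specs(struct ib_kern_flow_attr *kern_attr,
				   void *specs, size_t specs_len)
{
	void *p = specs;
	int i;

	for (i = 0; i < kern_attr->num_of_specs; i++) {
		struct ib_kern_spec *spec = p;

		/* each spec starts with an 8-byte {type, size, reserved} header */
		if (specs_len < 8 || spec->size < 8 || spec->size > specs_len)
			return -EINVAL;

		pr_debug("spec %d: type 0x%x size %u\n",
			 i, spec->type, spec->size);

		p         += spec->size;
		specs_len -= spec->size;
	}

	return 0;
}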

Some files were not shown because too many files changed in this diff